Diff 558095

clang/lib/CodeGen/Targets/AMDGPU.cpp

Show First 20 Lines • Show All 509 Lines • ▼ Show 20 Lines
}		}

void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(		void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
const FunctionType *&FT) const {		const FunctionType *&FT) const {
FT = getABIInfo().getContext().adjustFunctionType(		FT = getABIInfo().getContext().adjustFunctionType(
FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));		FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
}		}

		/// Return IR struct type corresponding to kernel_descriptor_t (See
		/// AMDHSAKernelDescriptor.h)
		///
		/// The element types are taken from \p CGF (Int8Ty, Int16Ty, Int32Ty,
		/// Int64Ty) and the struct is created in CGF's LLVMContext under the name
		/// "kernel_descriptor_t". Reserved ranges are represented as i8 arrays.
		/// The listed elements add up to 64 bytes: 3 x i32, 4 reserved bytes, one
		/// i64, 20 reserved bytes, 3 x i32, one i16 and 6 reserved bytes.
		static llvm::StructType *getAMDGPUKernelDescriptorType(CodeGenFunction &CGF) {
		return llvm::StructType::create(
		barannikov88Unsubmitted Done Reply Inline Actions Minor suggestion: you can get these types from CGF / CGM (Int8Ty etc.) barannikov88: Minor suggestion: you can get these types from CGF / CGM (Int8Ty etc.)
		CGF.getLLVMContext(),
		{
		CGF.Int32Ty, // group_segment_fixed_size
		CGF.Int32Ty, // private_segment_fixed_size
		CGF.Int32Ty, // kernarg_size
		llvm::ArrayType::get(CGF.Int8Ty, 4), // reserved0
		CGF.Int64Ty, // kernel_code_entry_byte_offset
		llvm::ArrayType::get(CGF.Int8Ty, 20), // reserved1
		CGF.Int32Ty, // compute_pgm_rsrc3
		CGF.Int32Ty, // compute_pgm_rsrc1
		CGF.Int32Ty, // compute_pgm_rsrc2
		CGF.Int16Ty, // kernel_code_properties
		llvm::ArrayType::get(CGF.Int8Ty, 6) // reserved2
		},
		"kernel_descriptor_t");
		}

		/// Build the IR struct type describing the rtinfo struct that
		/// rocm-device-libs uses for device enqueue. The fields are, in order:
		///
		///   ptr addrspace(1) kernel_object, i32 private_segment_size,
		///   i32 group_segment_size
		///
		/// \p KernelDescriptorPtrTy is used as the type of the first field; the two
		/// size fields are 32-bit integers obtained from \p C. The struct is
		/// created under the name "block.runtime.handle.t".
		static llvm::StructType *
		getAMDGPURuntimeHandleType(llvm::LLVMContext &C,
		                           llvm::Type *KernelDescriptorPtrTy) {
		  llvm::Type *SegmentSizeTy = llvm::Type::getInt32Ty(C);
		  llvm::Type *HandleFields[] = {KernelDescriptorPtrTy, SegmentSizeTy,
		                                SegmentSizeTy};
		  return llvm::StructType::create(C, HandleFields, "block.runtime.handle.t");
		}

/// Create an OpenCL kernel for an enqueued block.		/// Create an OpenCL kernel for an enqueued block.
///		///
/// The type of the first argument (the block literal) is the struct type		/// The type of the first argument (the block literal) is the struct type
/// of the block literal instead of a pointer type. The first argument		/// of the block literal instead of a pointer type. The first argument
/// (block literal) is passed directly by value to the kernel. The kernel		/// (block literal) is passed directly by value to the kernel. The kernel
/// allocates the same type of struct on stack and stores the block literal		/// allocates the same type of struct on stack and stores the block literal
/// to it and passes its pointer to the block invoke function. The kernel		/// to it and passes its pointer to the block invoke function. The kernel
/// has "enqueued-block" function attribute and kernel argument metadata.		/// has "enqueued-block" function attribute and kernel argument metadata.
Show All 23 Lines	for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));		ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));		AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
AccessQuals.push_back(llvm::MDString::get(C, "none"));		AccessQuals.push_back(llvm::MDString::get(C, "none"));
ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));		ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
ArgTypeQuals.push_back(llvm::MDString::get(C, ""));		ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
ArgNames.push_back(		ArgNames.push_back(
llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));		llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
}		}
std::string Name = Invoke->getName().str() + "_kernel";
		llvm::Module &Mod = CGF.CGM.getModule();
		const llvm::DataLayout &DL = Mod.getDataLayout();

		llvm::Twine Name = Invoke->getName() + "_kernel";
auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);		auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);

		// The kernel itself can be internal, the runtime does not directly access the
		// kernel address (only the kernel descriptor).
auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,		auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
&CGF.CGM.getModule());		&Mod);
F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);		F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);

llvm::AttrBuilder KernelAttrs(C);		llvm::AttrBuilder KernelAttrs(C);
// FIXME: The invoke isn't applying the right attributes either		// FIXME: The invoke isn't applying the right attributes either
// FIXME: This is missing setTargetAttributes		// FIXME: This is missing setTargetAttributes
CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);		CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
KernelAttrs.addAttribute("enqueued-block");
F->addFnAttrs(KernelAttrs);		F->addFnAttrs(KernelAttrs);

auto IP = CGF.Builder.saveIP();		auto IP = CGF.Builder.saveIP();
auto *BB = llvm::BasicBlock::Create(C, "entry", F);		auto *BB = llvm::BasicBlock::Create(C, "entry", F);
Builder.SetInsertPoint(BB);		Builder.SetInsertPoint(BB);
const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);		const auto BlockAlign = DL.getPrefTypeAlign(BlockTy);
auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);		auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
BlockPtr->setAlignment(BlockAlign);		BlockPtr->setAlignment(BlockAlign);
Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);		Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));		auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
llvm::SmallVector<llvm::Value *, 2> Args;		llvm::SmallVector<llvm::Value *, 2> Args;
Args.push_back(Cast);		Args.push_back(Cast);
for (llvm::Argument &A : llvm::drop_begin(F->args()))		for (llvm::Argument &A : llvm::drop_begin(F->args()))
Args.push_back(&A);		Args.push_back(&A);
llvm::CallInst *call = Builder.CreateCall(Invoke, Args);		llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
call->setCallingConv(Invoke->getCallingConv());		call->setCallingConv(Invoke->getCallingConv());
Builder.CreateRetVoid();		Builder.CreateRetVoid();
Builder.restoreIP(IP);		Builder.restoreIP(IP);

F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));		F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));		F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));		F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
F->setMetadata("kernel_arg_base_type",		F->setMetadata("kernel_arg_base_type",
llvm::MDNode::get(C, ArgBaseTypeNames));		llvm::MDNode::get(C, ArgBaseTypeNames));
F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));		F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)		if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));		F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));

return F;		llvm::Type *KernelDescriptorTy = getAMDGPUKernelDescriptorType(CGF);
		llvm::StructType *HandleTy = getAMDGPURuntimeHandleType(
		C, KernelDescriptorTy->getPointerTo(DL.getDefaultGlobalsAddressSpace()));
		llvm::Constant *RuntimeHandleInitializer =
		llvm::ConstantAggregateZero::get(HandleTy);

		llvm::Twine RuntimeHandleName = F->getName() + ".runtime.handle";

		// The runtime needs access to the runtime handle as an external symbol. The
		// runtime handle will need to be made external later, in
		// AMDGPUExportOpenCLEnqueuedBlocks. The kernel itself has a hidden reference
		// inside the runtime handle, and is not directly referenced.

		// TODO: We would initialize the first field by declaring F->getName() + ".kd"
		// to reference the kernel descriptor. The runtime wouldn't need to bother
		// setting it. We would need to have a final symbol name though.
		// TODO: Can we directly use an external symbol with getGlobalIdentifier?
		auto *RuntimeHandle = new llvm::GlobalVariable(
		Mod, HandleTy,
		/*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
		/*Initializer=*/RuntimeHandleInitializer, RuntimeHandleName,
		/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
		DL.getDefaultGlobalsAddressSpace(),
		/*isExternallyInitialized=*/true);

		llvm::MDNode *HandleAsMD =
		llvm::MDNode::get(C, llvm::ValueAsMetadata::get(RuntimeHandle));
		F->setMetadata(llvm::LLVMContext::MD_associated, HandleAsMD);

		RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

		CGF.CGM.addUsedGlobal(F);
		CGF.CGM.addUsedGlobal(RuntimeHandle);
		return RuntimeHandle;
}		}

void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(		void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,		llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,		const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
int32_t *MaxThreadsVal) {		int32_t *MaxThreadsVal) {
unsigned Min = 0;		unsigned Min = 0;
unsigned Max = 0;		unsigned Max = 0;
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl

This file was added.

				// Make sure that invoking blocks in static functions with the same name in
				// different modules are linked together.

				// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_first -DTYPE=float -DCONST=256.0f -emit-llvm-bc -o %t.0.bc %s
				// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -fno-ident -DKERNEL_NAME=test_kernel_second -DTYPE=int -DCONST=128.0f -emit-llvm-bc -o %t.1.bc %s

				// Make sure nothing strange happens with the linkage choices.
				// RUN: opt -passes=globalopt -o %t.opt.0.bc %t.0.bc
				// RUN: opt -passes=globalopt -o %t.opt.1.bc %t.1.bc

				// Check the result of linking
				// RUN: llvm-link -S %t.opt.0.bc %t.opt.1.bc -o - | FileCheck %s

				// Make sure that a block invoke used with the same name works in multiple
				// translation units

				// CHECK: @llvm.used = appending addrspace(1) global [4 x ptr] [ptr @__static_invoker_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr @__static_invoker_block_invoke_kernel.2, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr)], section "llvm.metadata"


				// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
				// CHECK: @__static_invoker_block_invoke_kernel.runtime.handle.3 = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"

				// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_FIRST_MD:[0-9]+]]


				// CHECK-LABEL: define internal void @__static_invoker_block_invoke(ptr noundef %.block_descriptor)
				// CHECK: call float @llvm.fmuladd.f32


				// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_first(


				// CHECK-LABEL: define internal fastcc void @static_invoker(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr)
				// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) byval(%struct.ndrange_t) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle to ptr), ptr %{{[0-9]+}})

				// CHECK: declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr

				// CHECK: define internal amdgpu_kernel void @__static_invoker_block_invoke_kernel.2(<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1) }> %0) #{{[0-9]+}} !associated ![[ASSOC_SECOND_MD:[0-9]+]]
				// CHECK: call void @__static_invoker_block_invoke.4(ptr %


				// CHECK-LABEL: define internal void @__static_invoker_block_invoke.4(ptr noundef %.block_descriptor)
				// CHECK: mul nsw i32
				// CHECK: sitofp
				// CHECK: fadd
				// CHECK: fptosi

				// CHECK-LABEL: define dso_local amdgpu_kernel void @test_kernel_second(ptr addrspace(1) noundef align 4 %outptr, ptr addrspace(1) noundef align 4 %argptr, ptr addrspace(1) noundef align 4 %difference)

				// CHECK-LABEL: define internal fastcc void @static_invoker.5(ptr addrspace(1) noundef %outptr, ptr addrspace(1) noundef %argptr) unnamed_addr #{{[0-9]+}} {
				// CHECK: call i32 @__enqueue_kernel_basic(ptr addrspace(1) %{{[0-9]+}}, i32 %{{[0-9]+}}, ptr addrspace(5) byval(%struct.ndrange_t) %tmp, ptr addrspacecast (ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3 to ptr), ptr %{{[0-9]+}})

				typedef struct {int a;} ndrange_t;

				// File-local helper. TYPE and CONST come from the -D options on the RUN
				// lines, so each of the two compiled modules defines a helper with this
				// same name. It enqueues a block that uses both pointer arguments.
				static void static_invoker(global TYPE* outptr, global TYPE* argptr) {
				queue_t default_queue;
				unsigned flags = 0;
				ndrange_t ndrange;

				enqueue_kernel(default_queue, flags, ndrange,
				^(void) {
				global TYPE* f = argptr;
				// With TYPE=float this is the multiply-add matched as llvm.fmuladd.f32;
				// with TYPE=int it is the mul/sitofp/fadd/fptosi sequence matched below
				// the second block invoke function.
				outptr[0] = f[1] * f[2] + CONST;
				});
				}

				// Kernel entry point. KERNEL_NAME, TYPE and CONST are supplied by the -D
				// options on the RUN lines. All three parameters are pointers to global
				// memory: outptr and argptr are forwarded to static_invoker, and
				// difference receives CONST.
				kernel void KERNEL_NAME(global TYPE *outptr, global TYPE *argptr, global TYPE *difference) {
				queue_t default_queue;
				unsigned flags = 0;
				ndrange_t ndrange;

				static_invoker(outptr, argptr);

				*difference = CONST;
				}

				// CHECK: ![[ASSOC_FIRST_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle}
				// CHECK: ![[ASSOC_SECOND_MD]] = !{ptr addrspace(1) @__static_invoker_block_invoke_kernel.runtime.handle.3}

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	kernel void test_target_features_kernel(global int *i) {

enqueue_kernel(default_queue, flags, ndrange,		enqueue_kernel(default_queue, flags, ndrange,
^(void) {		^(void) {
__builtin_amdgcn_s_memtime();		__builtin_amdgcn_s_memtime();
});		});
}		}

//.		//.
		// CHECK: @__test_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
		// CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
		// CHECK: @__test_block_invoke_3_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
		// CHECK: @__test_block_invoke_4_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.5 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
// CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0		// CHECK: @__block_literal_global = internal addrspace(1) constant { i32, i32, ptr } { i32 16, i32 8, ptr @__test_target_features_kernel_block_invoke }, align 8 #0
		// CHECK: @__test_target_features_kernel_block_invoke_kernel.runtime.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t.7 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
		// CHECK: @llvm.used = appending addrspace(1) global [10 x ptr] [ptr @__test_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr @__test_block_invoke_2_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr @__test_block_invoke_3_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr @__test_block_invoke_4_kernel, ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr)], section "llvm.metadata"
//.		//.
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone		// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@callee		// NOCPU-LABEL: define {{[^@]+}}@callee
// NOCPU-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {		// NOCPU-SAME: (i64 noundef [[ID:%.*]], ptr addrspace(1) noundef [[OUT:%.*]]) #[[ATTR1:[0-9]+]] {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)		// NOCPU-NEXT: [[ID_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
// NOCPU-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)		// NOCPU-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// NOCPU-NEXT: store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8		// NOCPU-NEXT: store i64 [[ID]], ptr addrspace(5) [[ID_ADDR]], align 8
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
// NOCPU-NEXT: store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8		// NOCPU-NEXT: store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3		// NOCPU-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
// NOCPU-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8		// NOCPU-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1		// NOCPU-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
// NOCPU-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8		// NOCPU-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
// NOCPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr		// NOCPU-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
// NOCPU-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])		// NOCPU-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[TMP4]])
// NOCPU-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8		// NOCPU-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4		// NOCPU-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)		// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
// NOCPU-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0		// NOCPU-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
// NOCPU-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8		// NOCPU-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1		// NOCPU-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4		// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
// NOCPU-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2		// NOCPU-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
// NOCPU-NEXT: store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8		// NOCPU-NEXT: store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3		// NOCPU-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
// NOCPU-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8		// NOCPU-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6		// NOCPU-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
// NOCPU-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1		// NOCPU-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
// NOCPU-NEXT: store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8		// NOCPU-NEXT: store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
// NOCPU-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8		// NOCPU-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5		// NOCPU-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
// NOCPU-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8		// NOCPU-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
// NOCPU-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8		// NOCPU-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
// NOCPU-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr		// NOCPU-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
// NOCPU-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])		// NOCPU-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[TMP12]])
// NOCPU-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8		// NOCPU-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4		// NOCPU-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)		// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
// NOCPU-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0		// NOCPU-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
// NOCPU-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8		// NOCPU-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1		// NOCPU-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4		// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
// NOCPU-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2		// NOCPU-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
// NOCPU-NEXT: store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8		// NOCPU-NEXT: store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3		// NOCPU-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
// NOCPU-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8		// NOCPU-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6		// NOCPU-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
// NOCPU-NEXT: [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1		// NOCPU-NEXT: [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1
// NOCPU-NEXT: store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8		// NOCPU-NEXT: store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
// NOCPU-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8		// NOCPU-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5		// NOCPU-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
// NOCPU-NEXT: [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8		// NOCPU-NEXT: [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
// NOCPU-NEXT: store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8		// NOCPU-NEXT: store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8
// NOCPU-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr		// NOCPU-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
// NOCPU-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0		// NOCPU-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8		// NOCPU-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8
// NOCPU-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])		// NOCPU-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0		// NOCPU-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
// NOCPU-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8		// NOCPU-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1		// NOCPU-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4		// NOCPU-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
// NOCPU-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2		// NOCPU-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
// NOCPU-NEXT: store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8		// NOCPU-NEXT: store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3		// NOCPU-NEXT: [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
// NOCPU-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8		// NOCPU-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8
// NOCPU-NEXT: store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8		// NOCPU-NEXT: store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
// NOCPU-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8		// NOCPU-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8
// NOCPU-NEXT: store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8
// NOCPU-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr		// NOCPU-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// NOCPU-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8		// NOCPU-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8
// NOCPU-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8		// NOCPU-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4		// NOCPU-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)		// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
// NOCPU-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8		// NOCPU-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8
// NOCPU-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr		// NOCPU-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// NOCPU-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])		// NOCPU-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[TMP28]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent noinline nounwind optnone		// NOCPU: Function Attrs: convergent noinline nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke		// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke
// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] {		// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4:[0-9]+]] {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8		// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8		// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
// NOCPU-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8		// NOCPU-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8
// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3		// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8		// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
// NOCPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0		// NOCPU-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
// NOCPU-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1		// NOCPU-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent nounwind		// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel		// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {		// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5:[0-9]+]] !associated [[META7:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)		// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// NOCPU-NEXT: call void @__test_block_invoke(ptr [[TMP2]])		// NOCPU-NEXT: call void @__test_block_invoke(ptr [[TMP2]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
Show All 17 Lines
// NOCPU-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8		// NOCPU-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8
// NOCPU-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0		// NOCPU-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
// NOCPU-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8		// NOCPU-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent nounwind		// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel		// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {		// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META12:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)		// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// NOCPU-NEXT: call void @__test_block_invoke_2(ptr [[TMP2]])		// NOCPU-NEXT: call void @__test_block_invoke_2(ptr [[TMP2]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
Show All 22 Lines
// NOCPU-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4		// NOCPU-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4
// NOCPU-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0		// NOCPU-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
// NOCPU-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4		// NOCPU-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent nounwind		// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel		// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !11 !kernel_arg_access_qual !12 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !14 {		// NOCPU-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META13:![0-9]+]] !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)		// NOCPU-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8		// NOCPU-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
// NOCPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr		// NOCPU-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
// NOCPU-NEXT: call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])		// NOCPU-NEXT: call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
Show All 10 Lines
// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4		// NOCPU-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8		// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8
// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]]		// NOCPU-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]]
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent nounwind		// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel		// NOCPU-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {		// NOCPU-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META18:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)		// NOCPU-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// NOCPU-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// NOCPU-NEXT: call void @__test_block_invoke_4(ptr [[TMP2]])		// NOCPU-NEXT: call void @__test_block_invoke_4(ptr [[TMP2]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone		// NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel		// NOCPU-LABEL: define {{[^@]+}}@test_target_features_kernel
// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !15 !kernel_arg_access_qual !8 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !10 {		// NOCPU-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !9 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)		// NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)		// NOCPU-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)		// NOCPU-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)		// NOCPU-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)		// NOCPU-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8		// NOCPU-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8
// NOCPU-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4		// NOCPU-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()		// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8		// NOCPU-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4		// NOCPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4
// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)		// NOCPU-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false)
// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))		// NOCPU-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent noinline nounwind optnone		// NOCPU: Function Attrs: convergent noinline nounwind optnone
// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke		// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke
// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {		// NOCPU-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR4]] {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// NOCPU-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// NOCPU-NEXT: [[BLOCK_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8		// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8		// NOCPU-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[BLOCK_ADDR]], align 8
// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()		// NOCPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
// NOCPU: Function Attrs: convergent nounwind		// NOCPU: Function Attrs: convergent nounwind
// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel		// NOCPU-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !kernel_arg_addr_space !7 !kernel_arg_access_qual !8 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !10 {		// NOCPU-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META21:![0-9]+]] !kernel_arg_addr_space !8 !kernel_arg_access_qual !9 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !11 {
// NOCPU-NEXT: entry:		// NOCPU-NEXT: entry:
// NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)		// NOCPU-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
// NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// NOCPU-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// NOCPU-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// NOCPU-NEXT: call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])		// NOCPU-NEXT: call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])
// NOCPU-NEXT: ret void		// NOCPU-NEXT: ret void
//		//
//		//
▲ Show 20 Lines • Show All 46 Lines • ▼ Show 20 Lines
// GFX900-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)		// GFX900-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5)
// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)		// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5)
// GFX900-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)		// GFX900-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// GFX900-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)		// GFX900-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13:![0-9]+]]		// GFX900-NEXT: store i8 [[B]], ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13:![0-9]+]]
// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: store i64 [[D]], ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8:[0-9]+]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7:[0-9]+]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14:![0-9]+]]		// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14:![0-9]+]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16:![0-9]+]]		// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16:![0-9]+]]
// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18:![0-9]+]]		// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18:![0-9]+]]
// GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0		// GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
// GFX900-NEXT: store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8		// GFX900-NEXT: store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1		// GFX900-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4		// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
// GFX900-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2		// GFX900-NEXT: [[BLOCK_INVOKE:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
// GFX900-NEXT: store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8		// GFX900-NEXT: store ptr @__test_block_invoke, ptr addrspace(5) [[BLOCK_INVOKE]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP2]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 4
// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]		// GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: store i8 [[TMP3]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr		// GFX900-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
// GFX900-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_block_invoke_kernel, ptr [[TMP4]])		// GFX900-NEXT: [[TMP5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[TMP4]])
// GFX900-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]		// GFX900-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]		// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP2]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0		// GFX900-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 0
// GFX900-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8		// GFX900-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1		// GFX900-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 1
// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4		// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
// GFX900-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2		// GFX900-NEXT: [[BLOCK_INVOKE6:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 2
// GFX900-NEXT: store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8		// GFX900-NEXT: store ptr @__test_block_invoke_2, ptr addrspace(5) [[BLOCK_INVOKE6]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 3
// GFX900-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP8:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP8]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6		// GFX900-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 6
// GFX900-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]		// GFX900-NEXT: [[TMP9:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: store i8 [[TMP9]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 4
// GFX900-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP10]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5		// GFX900-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK3]], i32 0, i32 5
// GFX900-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: [[TMP11:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: store i64 [[TMP11]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr		// GFX900-NEXT: [[TMP12:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK3]] to ptr
// GFX900-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr @__test_block_invoke_2_kernel, ptr [[TMP12]])		// GFX900-NEXT: [[TMP13:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP6]], i32 [[TMP7]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[TMP12]])
// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]		// GFX900-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: [[TMP15:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]		// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP11]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0		// GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 0
// GFX900-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8		// GFX900-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE13]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1		// GFX900-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 1
// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4		// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN14]], align 4
// GFX900-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2		// GFX900-NEXT: [[BLOCK_INVOKE15:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 2
// GFX900-NEXT: store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8		// GFX900-NEXT: store ptr @__test_block_invoke_3, ptr addrspace(5) [[BLOCK_INVOKE15]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURED16:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 3
// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP16:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[A_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP16]], ptr addrspace(5) [[BLOCK_CAPTURED16]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6		// GFX900-NEXT: [[BLOCK_CAPTURED17:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 6
// GFX900-NEXT: [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]		// GFX900-NEXT: [[TMP17:%.*]] = load i8, ptr addrspace(5) [[B_ADDR]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: store i8 [[TMP17]], ptr addrspace(5) [[BLOCK_CAPTURED17]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURED18:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 4
// GFX900-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP18]], ptr addrspace(5) [[BLOCK_CAPTURED18]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5		// GFX900-NEXT: [[BLOCK_CAPTURED19:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK12]], i32 0, i32 5
// GFX900-NEXT: [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: [[TMP19:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: store i64 [[TMP19]], ptr addrspace(5) [[BLOCK_CAPTURED19]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr		// GFX900-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK12]] to ptr
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0		// GFX900-NEXT: [[TMP21:%.*]] = getelementptr [1 x i64], ptr addrspace(5) [[BLOCK_SIZES]], i32 0, i32 0
// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8		// GFX900-NEXT: store i64 100, ptr addrspace(5) [[TMP21]], align 8
// GFX900-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr @__test_block_invoke_3_kernel, ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])		// GFX900-NEXT: [[TMP22:%.*]] = call i32 @__enqueue_kernel_varargs(ptr addrspace(1) [[TMP14]], i32 [[TMP15]], ptr addrspace(5) [[VARTMP11]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle to ptr), ptr [[TMP20]], i32 1, ptr addrspace(5) [[TMP21]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK_SIZES]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]]
// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0		// GFX900-NEXT: [[BLOCK_SIZE22:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 0
// GFX900-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8		// GFX900-NEXT: store i32 32, ptr addrspace(5) [[BLOCK_SIZE22]], align 8
// GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1		// GFX900-NEXT: [[BLOCK_ALIGN23:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 1
// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4		// GFX900-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN23]], align 4
// GFX900-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2		// GFX900-NEXT: [[BLOCK_INVOKE24:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 2
// GFX900-NEXT: store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8		// GFX900-NEXT: store ptr @__test_block_invoke_4, ptr addrspace(5) [[BLOCK_INVOKE24]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURED25:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 3
// GFX900-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: [[TMP23:%.*]] = load i64, ptr addrspace(5) [[D_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: store i64 [[TMP23]], ptr addrspace(5) [[BLOCK_CAPTURED25]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURED26:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr addrspace(5) [[BLOCK21]], i32 0, i32 4
// GFX900-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[C_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[TMP24]], ptr addrspace(5) [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr		// GFX900-NEXT: [[BLOCK21_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]		// GFX900-NEXT: [[TMP25:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]		// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[VARTMP27]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: [[TMP27:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr		// GFX900-NEXT: [[TMP28:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK21]] to ptr
// GFX900-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr @__test_block_invoke_4_kernel, ptr [[TMP28]])		// GFX900-NEXT: [[TMP29:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP25]], i32 [[TMP26]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[TMP28]])
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[BLOCK20]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke
// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] {		// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5:[0-9]+]] {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8		// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA13]]		// GFX900-NEXT: [[TMP0:%.*]] = load i8, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA13]]
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0		// GFX900-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 0
// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA13]]		// GFX900-NEXT: store i8 [[TMP0]], ptr addrspace(1) [[ARRAYIDX]], align 1, !tbaa [[TBAA13]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR6:[0-9]+]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {		// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META19:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)		// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// GFX900-NEXT: call void @__test_block_invoke(ptr [[TMP2]])		// GFX900-NEXT: call void @__test_block_invoke(ptr [[TMP2]])
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
Show All 15 Lines
// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0		// GFX900-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[TMP3]], i64 0
// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {		// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META24:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)		// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// GFX900-NEXT: call void @__test_block_invoke_2(ptr [[TMP2]])		// GFX900-NEXT: call void @__test_block_invoke_2(ptr [[TMP2]])
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
Show All 20 Lines
// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(5) [[LP_ADDR]], align 4, !tbaa [[TBAA7]]
// GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0		// GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0
// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_3_kernel
// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !23 !kernel_arg_access_qual !24 !kernel_arg_type !25 !kernel_arg_base_type !25 !kernel_arg_type_qual !26 {		// GFX900-SAME: (<{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0:%.*]], ptr addrspace(3) [[TMP1:%.*]]) #[[ATTR5]] !associated [[META25:![0-9]+]] !kernel_arg_addr_space !26 !kernel_arg_access_qual !27 !kernel_arg_type !28 !kernel_arg_base_type !28 !kernel_arg_type_qual !29 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)		// GFX900-NEXT: [[TMP2:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8		// GFX900-NEXT: store <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[TMP0]], ptr addrspace(5) [[TMP2]], align 8
// GFX900-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr		// GFX900-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(5) [[TMP2]] to ptr
// GFX900-NEXT: call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])		// GFX900-NEXT: call void @__test_block_invoke_3(ptr [[TMP3]], ptr addrspace(3) [[TMP1]])
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4
// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {		// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8		// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3		// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 3
// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]		// GFX900-NEXT: [[TMP0:%.*]] = load i64, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !tbaa [[TBAA3]]
// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4		// GFX900-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds <{ i32, i32, ptr, i64, ptr addrspace(1) }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 4
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR9:[0-9]+]]		// GFX900-NEXT: call void @callee(i64 noundef [[TMP0]], ptr addrspace(1) noundef [[TMP1]]) #[[ATTR8:[0-9]+]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel		// GFX900-LABEL: define {{[^@]+}}@__test_block_invoke_4_kernel
// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {		// GFX900-SAME: (<{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0:%.*]]) #[[ATTR5]] !associated [[META30:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)		// GFX900-NEXT: [[TMP1:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5)
// GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// GFX900-NEXT: store <{ i32, i32, ptr, i64, ptr addrspace(1) }> [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// GFX900-NEXT: call void @__test_block_invoke_4(ptr [[TMP2]])		// GFX900-NEXT: call void @__test_block_invoke_4(ptr [[TMP2]])
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent norecurse nounwind		// GFX900: Function Attrs: convergent norecurse nounwind
// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel		// GFX900-LABEL: define {{[^@]+}}@test_target_features_kernel
// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space !27 !kernel_arg_access_qual !20 !kernel_arg_type !28 !kernel_arg_base_type !28 !kernel_arg_type_qual !22 {		// GFX900-SAME: (ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space !31 !kernel_arg_access_qual !21 !kernel_arg_type !32 !kernel_arg_base_type !32 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)		// GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)		// GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)		// GFX900-NEXT: [[FLAGS:%.*]] = alloca i32, align 4, addrspace(5)
// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)		// GFX900-NEXT: [[NDRANGE:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)		// GFX900-NEXT: [[TMP:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[TBAA7]]		// GFX900-NEXT: store ptr addrspace(1) [[I]], ptr addrspace(5) [[I_ADDR]], align 8, !tbaa [[TBAA7]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()		// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]		// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA16]]
// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]		// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA14]]
// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]		// GFX900-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[TMP]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT18]]
// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr @__test_target_features_kernel_block_invoke_kernel, ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))		// GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) byval([[STRUCT_NDRANGE_T]]) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr))
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[NDRANGE]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[FLAGS]]) #[[ATTR7]]
// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR8]]		// GFX900-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR7]]
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke		// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke
// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {		// GFX900-SAME: (ptr noundef [[DOTBLOCK_DESCRIPTOR:%.*]]) #[[ATTR5]] {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)		// GFX900-NEXT: [[DOTBLOCK_DESCRIPTOR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8		// GFX900-NEXT: store ptr [[DOTBLOCK_DESCRIPTOR]], ptr addrspace(5) [[DOTBLOCK_DESCRIPTOR_ADDR]], align 8
// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()		// GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime()
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//		//
// GFX900: Function Attrs: convergent nounwind		// GFX900: Function Attrs: convergent nounwind
// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel		// GFX900-LABEL: define {{[^@]+}}@__test_target_features_kernel_block_invoke_kernel
// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR6]] !kernel_arg_addr_space !19 !kernel_arg_access_qual !20 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !22 {		// GFX900-SAME: ({ i32, i32, ptr } [[TMP0:%.*]]) #[[ATTR5]] !associated [[META33:![0-9]+]] !kernel_arg_addr_space !20 !kernel_arg_access_qual !21 !kernel_arg_type !22 !kernel_arg_base_type !22 !kernel_arg_type_qual !23 {
// GFX900-NEXT: entry:		// GFX900-NEXT: entry:
// GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)		// GFX900-NEXT: [[TMP1:%.*]] = alloca { i32, i32, ptr }, align 8, addrspace(5)
// GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8		// GFX900-NEXT: store { i32, i32, ptr } [[TMP0]], ptr addrspace(5) [[TMP1]], align 8
// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr		// GFX900-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[TMP1]] to ptr
// GFX900-NEXT: call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])		// GFX900-NEXT: call void @__test_target_features_kernel_block_invoke(ptr [[TMP2]])
// GFX900-NEXT: ret void		// GFX900-NEXT: ret void
//		//
//.		//.
// NOCPU: attributes #0 = { "objc_arc_inert" }		// NOCPU: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
// NOCPU: attributes #1 = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }		// NOCPU: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// NOCPU: attributes #2 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }		// NOCPU: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
// NOCPU: attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }		// NOCPU: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
// NOCPU: attributes #4 = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }		// NOCPU: attributes #[[ATTR4]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// NOCPU: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }		// NOCPU: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// NOCPU: attributes #6 = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }		// NOCPU: attributes #[[ATTR6]] = { convergent noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+s-memtime-inst" "uniform-work-group-size"="false" }
// NOCPU: attributes #7 = { nocallback nofree nosync nounwind willreturn }		// NOCPU: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
// NOCPU: attributes #8 = { convergent nounwind }		// NOCPU: attributes #[[ATTR8]] = { convergent nounwind }
//.		//.
// GFX900: attributes #0 = { "objc_arc_inert" }		// GFX900: attributes #[[ATTR0:[0-9]+]] = { "objc_arc_inert" }
// GFX900: attributes #1 = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }		// GFX900: attributes #[[ATTR1]] = { convergent norecurse nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
// GFX900: attributes #2 = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }		// GFX900: attributes #[[ATTR2]] = { convergent norecurse nounwind "amdgpu-flat-work-group-size"="1,256" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" "uniform-work-group-size"="false" }
// GFX900: attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }		// GFX900: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
// GFX900: attributes #4 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }		// GFX900: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
// GFX900: attributes #5 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }		// GFX900: attributes #[[ATTR5]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }
// GFX900: attributes #6 = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "enqueued-block" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,-sram-ecc" }		// GFX900: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind willreturn }
// GFX900: attributes #7 = { nocallback nofree nosync nounwind willreturn }		// GFX900: attributes #[[ATTR7]] = { nounwind }
// GFX900: attributes #8 = { nounwind }		// GFX900: attributes #[[ATTR8]] = { convergent nounwind }
// GFX900: attributes #9 = { convergent nounwind }
//.		//.
// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}		// NOCPU: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
// NOCPU: !1 = !{i32 1, !"wchar_size", i32 4}		// NOCPU: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// NOCPU: !2 = !{i32 2, i32 0}		// NOCPU: [[META2:![0-9]+]] = !{i32 2, i32 0}
// NOCPU: !3 = !{i32 1, i32 0, i32 1, i32 0}		// NOCPU: [[META3:![0-9]+]] = !{i32 1, i32 0, i32 1, i32 0}
// NOCPU: !4 = !{!"none", !"none", !"none", !"none"}		// NOCPU: [[META4:![0-9]+]] = !{!"none", !"none", !"none", !"none"}
// NOCPU: !5 = !{!"char", !"char", !"long", !"long"}		// NOCPU: [[META5:![0-9]+]] = !{!"char", !"char", !"long", !"long"}
// NOCPU: !6 = !{!"", !"", !"", !""}		// NOCPU: [[META6:![0-9]+]] = !{!"", !"", !"", !""}
// NOCPU: !7 = !{i32 0}		// NOCPU: [[META7]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle}
// NOCPU: !8 = !{!"none"}		// NOCPU: [[META8:![0-9]+]] = !{i32 0}
// NOCPU: !9 = !{!"__block_literal"}		// NOCPU: [[META9:![0-9]+]] = !{!"none"}
// NOCPU: !10 = !{!""}		// NOCPU: [[META10:![0-9]+]] = !{!"__block_literal"}
// NOCPU: !11 = !{i32 0, i32 3}		// NOCPU: [[META11:![0-9]+]] = !{!""}
// NOCPU: !12 = !{!"none", !"none"}		// NOCPU: [[META12]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle}
// NOCPU: !13 = !{!"__block_literal", !"void*"}		// NOCPU: [[META13]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle}
// NOCPU: !14 = !{!"", !""}		// NOCPU: [[META14:![0-9]+]] = !{i32 0, i32 3}
// NOCPU: !15 = !{i32 1}		// NOCPU: [[META15:![0-9]+]] = !{!"none", !"none"}
// NOCPU: !16 = !{!"int*"}		// NOCPU: [[META16:![0-9]+]] = !{!"__block_literal", !"void*"}
		// NOCPU: [[META17:![0-9]+]] = !{!"", !""}
		// NOCPU: [[META18]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle}
		// NOCPU: [[META19:![0-9]+]] = !{i32 1}
		// NOCPU: [[META20:![0-9]+]] = !{!"int*"}
		// NOCPU: [[META21]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle}
//.		//.
// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}		// GFX900: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
// GFX900: !1 = !{i32 1, !"wchar_size", i32 4}		// GFX900: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// GFX900: !2 = !{i32 2, i32 0}		// GFX900: [[META2:![0-9]+]] = !{i32 2, i32 0}
// GFX900: !3 = !{!4, !4, i64 0}		// GFX900: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
// GFX900: !4 = !{!"long", !5, i64 0}		// GFX900: [[META4]] = !{!"long", [[META5:![0-9]+]], i64 0}
// GFX900: !5 = !{!"omnipotent char", !6, i64 0}		// GFX900: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
// GFX900: !6 = !{!"Simple C/C++ TBAA"}		// GFX900: [[META6]] = !{!"Simple C/C++ TBAA"}
// GFX900: !7 = !{!8, !8, i64 0}		// GFX900: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
// GFX900: !8 = !{!"any pointer", !5, i64 0}		// GFX900: [[META8]] = !{!"any pointer", [[META5]], i64 0}
// GFX900: !9 = !{i32 1, i32 0, i32 1, i32 0}		// GFX900: [[META9:![0-9]+]] = !{i32 1, i32 0, i32 1, i32 0}
// GFX900: !10 = !{!"none", !"none", !"none", !"none"}		// GFX900: [[META10:![0-9]+]] = !{!"none", !"none", !"none", !"none"}
// GFX900: !11 = !{!"char", !"char", !"long", !"long"}		// GFX900: [[META11:![0-9]+]] = !{!"char", !"char", !"long", !"long"}
// GFX900: !12 = !{!"", !"", !"", !""}		// GFX900: [[META12:![0-9]+]] = !{!"", !"", !"", !""}
// GFX900: !13 = !{!5, !5, i64 0}		// GFX900: [[TBAA13]] = !{[[META5]], [[META5]], i64 0}
// GFX900: !14 = !{!15, !15, i64 0}		// GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0}
// GFX900: !15 = !{!"int", !5, i64 0}		// GFX900: [[META15]] = !{!"int", [[META5]], i64 0}
// GFX900: !16 = !{!17, !17, i64 0}		// GFX900: [[TBAA16]] = !{[[META17:![0-9]+]], [[META17]], i64 0}
// GFX900: !17 = !{!"queue_t", !5, i64 0}		// GFX900: [[META17]] = !{!"queue_t", [[META5]], i64 0}
// GFX900: !18 = !{i64 0, i64 4, !14}		// GFX900: [[TBAA_STRUCT18]] = !{i64 0, i64 4, [[TBAA14]]}
// GFX900: !19 = !{i32 0}		// GFX900: [[META19]] = !{ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle}
// GFX900: !20 = !{!"none"}		// GFX900: [[META20:![0-9]+]] = !{i32 0}
// GFX900: !21 = !{!"__block_literal"}		// GFX900: [[META21:![0-9]+]] = !{!"none"}
// GFX900: !22 = !{!""}		// GFX900: [[META22:![0-9]+]] = !{!"__block_literal"}
// GFX900: !23 = !{i32 0, i32 3}		// GFX900: [[META23:![0-9]+]] = !{!""}
// GFX900: !24 = !{!"none", !"none"}		// GFX900: [[META24]] = !{ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle}
// GFX900: !25 = !{!"__block_literal", !"void*"}		// GFX900: [[META25]] = !{ptr addrspace(1) @__test_block_invoke_3_kernel.runtime.handle}
// GFX900: !26 = !{!"", !""}		// GFX900: [[META26:![0-9]+]] = !{i32 0, i32 3}
// GFX900: !27 = !{i32 1}		// GFX900: [[META27:![0-9]+]] = !{!"none", !"none"}
// GFX900: !28 = !{!"int*"}		// GFX900: [[META28:![0-9]+]] = !{!"__block_literal", !"void*"}
		// GFX900: [[META29:![0-9]+]] = !{!"", !""}
		// GFX900: [[META30]] = !{ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle}
		// GFX900: [[META31:![0-9]+]] = !{i32 1}
		// GFX900: [[META32:![0-9]+]] = !{!"int*"}
		// GFX900: [[META33]] = !{ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle}
//.		//.
//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:		//// NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
// CHECK: {{.*}}		// CHECK: {{.*}}

llvm/docs/AMDGPUUsage.rst

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,727 Lines • ▼ Show 20 Lines	``.rela``\ name, ``.rela.dyn``
See :ref:`amdgpu-relocation-records` for the relocation records supported by		See :ref:`amdgpu-relocation-records` for the relocation records supported by
the AMDGPU backend.		the AMDGPU backend.

``.text``		``.text``
The executable machine code for the kernels and functions they call. Generated		The executable machine code for the kernels and functions they call. Generated
as position independent code. See :ref:`amdgpu-code-conventions` for		as position independent code. See :ref:`amdgpu-code-conventions` for
information on conventions used in the isa generation.		information on conventions used in the isa generation.

		``.amdgpu.kernel.runtime.handle``
		Symbols used for device enqueue.

.. _amdgpu-note-records:		.. _amdgpu-note-records:

Note Records		Note Records
------------		------------

The AMDGPU backend code object contains ELF note records in the ``.note``		The AMDGPU backend code object contains ELF note records in the ``.note``
section. The set of generated notes and their semantics depend on the code		section. The set of generated notes and their semantics depend on the code
object version; see :ref:`amdgpu-note-records-v2` and		object version; see :ref:`amdgpu-note-records-v2` and
▲ Show 20 Lines • Show All 13,919 Lines • Show Last 20 Lines

llvm/lib/IR/AutoUpgrade.cpp

Show All 35 Lines
#include "llvm/IR/LLVMContext.h"		#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"		#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"		#include "llvm/IR/Verifier.h"
#include "llvm/Support/CommandLine.h"		#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"		#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Regex.h"		#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"		#include "llvm/TargetParser/Triple.h"
		#include "llvm/Transforms/Utils/ModuleUtils.h"

#include <cstring>		#include <cstring>

using namespace llvm;		using namespace llvm;

static cl::opt<bool>		static cl::opt<bool>
DisableAutoUpgradeDebugInfo("disable-auto-upgrade-debug-info",		DisableAutoUpgradeDebugInfo("disable-auto-upgrade-debug-info",
cl::desc("Disable autoupgrade of debug info"));		cl::desc("Disable autoupgrade of debug info"));

▲ Show 20 Lines • Show All 5,019 Lines • ▼ Show 20 Lines	void visitCallBase(CallBase &Call) {
// If we get here, the caller doesn't have the strictfp attribute		// If we get here, the caller doesn't have the strictfp attribute
// but this callsite does. Replace the strictfp attribute with nobuiltin.		// but this callsite does. Replace the strictfp attribute with nobuiltin.
Call.removeFnAttr(Attribute::StrictFP);		Call.removeFnAttr(Attribute::StrictFP);
Call.addFnAttr(Attribute::NoBuiltin);		Call.addFnAttr(Attribute::NoBuiltin);
}		}
};		};
} // namespace		} // namespace

		/// Build the named IR struct type "block.runtime.handle.t" used for an
		/// enqueued-block runtime handle: the kernel descriptor pointer type passed in
		/// \p KernelDescriptorPtrTy, followed by two i32 fields.
		static StructType *getAMDGPURuntimeHandleType(LLVMContext &C,
		                                              Type *KernelDescriptorPtrTy) {
		  Type *I32Ty = Type::getInt32Ty(C);
		  Type *Fields[] = {KernelDescriptorPtrTy, I32Ty, I32Ty};
		  return StructType::create(C, Fields, "block.runtime.handle.t");
		}

		/// Rewrite a function carrying the legacy "enqueued-block" attribute to the
		/// new scheme for enqueued block lowering.
		///
		/// The attribute is removed and replaced by a zero-initialized runtime handle
		/// global named "<function name>.runtime.handle", placed in the
		/// ".amdgpu.kernel.runtime.handle" section and attached to \p F through
		/// !associated metadata. Both \p F and the handle are appended to llvm.used.
		///
		/// Functions that are still materializable, or that do not carry the
		/// attribute, are left untouched.
		static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) {
		  // A verifier error is produced if we add metadata to the function during
		  // linking.
		  if (F.isMaterializable())
		    return;

		  const StringLiteral EnqueuedBlockName("enqueued-block");
		  if (!F.hasFnAttribute(EnqueuedBlockName))
		    return;

		  F.removeFnAttr(EnqueuedBlockName);

		  Module *M = F.getParent();
		  LLVMContext &Ctx = M->getContext();
		  const DataLayout &DL = M->getDataLayout();
		  const unsigned GlobalAS = DL.getDefaultGlobalsAddressSpace();

		  StructType *HandleTy =
		      getAMDGPURuntimeHandleType(Ctx, PointerType::get(Ctx, GlobalAS));

		  // The handle name is built directly in the argument list: a Twine refers
		  // to its operands by pointer and must not be stored in a local variable.
		  auto *RuntimeHandle = new GlobalVariable(
		      *M, HandleTy,
		      /*isConstant=*/true, F.getLinkage(),
		      /*Initializer=*/ConstantAggregateZero::get(HandleTy),
		      F.getName() + ".runtime.handle",
		      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal, GlobalAS,
		      /*isExternallyInitialized=*/true);
		  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");

		  // Tie the function to its handle so later passes can find the handle.
		  MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle));
		  F.setMetadata(LLVMContext::MD_associated, HandleAsMD);

		  appendToUsed(*M, {&F, RuntimeHandle});
		}

void llvm::UpgradeFunctionAttributes(Function &F) {		void llvm::UpgradeFunctionAttributes(Function &F) {
// If a function definition doesn't have the strictfp attribute,		// If a function definition doesn't have the strictfp attribute,
// convert any callsite strictfp attributes to nobuiltin.		// convert any callsite strictfp attributes to nobuiltin.
if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) {		if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) {
StrictFPUpgradeVisitor SFPV;		StrictFPUpgradeVisitor SFPV;
SFPV.visit(F);		SFPV.visit(F);
}		}

// Remove all incompatibile attributes from function.		// Remove all incompatibile attributes from function.
F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType()));		F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType()));
for (auto &Arg : F.args())		for (auto &Arg : F.args())
Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));		Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType()));

		if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
		upgradeAMDGPUKernelEnqueuedBlock(F);
}		}

static bool isOldLoopArgument(Metadata *MD) {		static bool isOldLoopArgument(Metadata *MD) {
auto *T = dyn_cast_or_null<MDTuple>(MD);		auto *T = dyn_cast_or_null<MDTuple>(MD);
if (!T)		if (!T)
return false;		return false;
if (T->getNumOperands() < 1)		if (T->getNumOperands() < 1)
return false;		return false;
▲ Show 20 Lines • Show All 178 Lines • Show Last 20 Lines

llvm/lib/IR/CMakeLists.txt

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	add_llvm_component_library(LLVMCore
${LLVM_PTHREAD_LIB}		${LLVM_PTHREAD_LIB}

DEPENDS		DEPENDS
intrinsics_gen		intrinsics_gen

LINK_COMPONENTS		LINK_COMPONENTS
BinaryFormat		BinaryFormat
Demangle		Demangle
		TransformUtils
		arsenm (Author, Unsubmitted, Done): This introduces a circular dependency between LLVMCore and TransformUtils. Options are: (1) Move appendToUsed into Module. (2) Don't bother with bitcode compatibility for this. (3) Avoid depending on llvm.used. I know I tried to do this, but it was so long ago I don't remember how I ended up on this solution.
Remarks		Remarks
Support		Support
TargetParser		TargetParser
)		)

llvm/lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 354 Lines • ▼ Show 20 Lines

	ImmutablePass *createAMDGPUAAWrapperPass();			ImmutablePass *createAMDGPUAAWrapperPass();
	void initializeAMDGPUAAWrapperPassPass(PassRegistry&);			void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
	ImmutablePass *createAMDGPUExternalAAWrapperPass();			ImmutablePass *createAMDGPUExternalAAWrapperPass();
	void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);			void initializeAMDGPUExternalAAWrapperPass(PassRegistry&);

	void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);			void initializeAMDGPUArgumentUsageInfoPass(PassRegistry &);

	ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass();			ModulePass *createAMDGPUExportKernelRuntimeHandlesPass();
	void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);			void initializeAMDGPUExportKernelRuntimeHandlesPass(PassRegistry &);
	extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;			extern char &AMDGPUExportKernelRuntimeHandlesID;

	void initializeGCNNSAReassignPass(PassRegistry &);			void initializeGCNNSAReassignPass(PassRegistry &);
	extern char &GCNNSAReassignID;			extern char &GCNNSAReassignID;

	void initializeGCNPreRALongBranchRegPass(PassRegistry &);			void initializeGCNPreRALongBranchRegPass(PassRegistry &);
	extern char &GCNPreRALongBranchRegID;			extern char &GCNPreRALongBranchRegID;

	void initializeGCNPreRAOptimizationsPass(PassRegistry &);			void initializeGCNPreRAOptimizationsPass(PassRegistry &);
	▲ Show 20 Lines • Show All 127 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp

This file was added.

				//===- AMDGPUExportKernelRuntimeHandles.cpp - Export runtime handles ------===//
				//
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				//
				// \file
				//
				// Give any globals used for OpenCL block enqueue runtime handles external
				// linkage so the runtime may access them. These should behave like internal
				// functions for purposes of linking, but need to have an external symbol in the
				// final object for the runtime to access them.
				//
				// TODO: This could be replaced with a new linkage type or global object
				// metadata that produces an external symbol in the final object, but allows
				// rename on IR linking. Alternatively if we can rely on
				// GlobalValue::getGlobalIdentifier we can just make these external symbols to
				// begin with.
				//
				//===----------------------------------------------------------------------===//

				#include "AMDGPU.h"
				#include "llvm/IR/Module.h"
				#include "llvm/Pass.h"

				#define DEBUG_TYPE "amdgpu-export-kernel-runtime-handles"

				using namespace llvm;

				namespace {

				/// Legacy module pass that externalizes the runtime handles used for OpenCL
				/// block enqueue, together with the kernels associated with them.
				class AMDGPUExportKernelRuntimeHandles : public ModulePass {
				  /// Pass entry point; see the out-of-line definition.
				  bool runOnModule(Module &M) override;

				public:
				  /// Pass identification storage, handed to the ModulePass constructor.
				  static char ID;

				  explicit AMDGPUExportKernelRuntimeHandles() : ModulePass(ID) {}
				};

				} // end anonymous namespace

				// Storage for the pass ID that the constructor hands to ModulePass.
				char AMDGPUExportKernelRuntimeHandles::ID = 0;

				// Reference to the pass ID, defined in namespace llvm so that code outside
				// this file can refer to the pass.
				char &llvm::AMDGPUExportKernelRuntimeHandlesID =
				AMDGPUExportKernelRuntimeHandles::ID;

				// Register the pass under the DEBUG_TYPE name with a human-readable
				// description.
				INITIALIZE_PASS(AMDGPUExportKernelRuntimeHandles, DEBUG_TYPE,
				"Externalize enqueued block runtime handles", false, false)

				/// Factory for the runtime-handle export pass; returns a newly allocated
				/// pass instance.
				ModulePass *llvm::createAMDGPUExportKernelRuntimeHandlesPass() {
				  ModulePass *ExportPass = new AMDGPUExportKernelRuntimeHandles();
				  return ExportPass;
				}

				/// Give every global in the ".amdgpu.kernel.runtime.handle" section external,
				/// non-dso_local linkage, then make each AMDGPU kernel whose !associated
				/// metadata points at such a handle external with protected visibility.
				///
				/// \returns true if at least one handle global was found and updated.
				bool AMDGPUExportKernelRuntimeHandles::runOnModule(Module &M) {
				  const StringLiteral HandleSectionName(".amdgpu.kernel.runtime.handle");

				  bool FoundHandle = false;
				  for (GlobalVariable &Global : M.globals()) {
				    if (Global.getSection() != HandleSectionName)
				      continue;

				    Global.setLinkage(GlobalValue::ExternalLinkage);
				    Global.setDSOLocal(false);
				    FoundHandle = true;
				  }

				  // Without any handle there is no kernel to export either.
				  if (!FoundHandle)
				    return false;

				  // FIXME: We shouldn't really need to export the kernel address. We can
				  // initialize the runtime handle with the kernel descriptor
				  for (Function &Kernel : M) {
				    if (Kernel.getCallingConv() != CallingConv::AMDGPU_KERNEL)
				      continue;

				    const MDNode *AssocMD = Kernel.getMetadata(LLVMContext::MD_associated);
				    if (!AssocMD)
				      continue;

				    auto *AssocValue = cast<ValueAsMetadata>(AssocMD->getOperand(0));
				    auto *Handle = dyn_cast<GlobalObject>(AssocValue->getValue());
				    if (!Handle || Handle->getSection() != HandleSectionName)
				      continue;

				    Kernel.setLinkage(GlobalValue::ExternalLinkage);
				    Kernel.setVisibility(GlobalValue::ProtectedVisibility);
				  }

				  // Reaching this point means at least one handle was rewritten above.
				  return true;
				}

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h

Show All 15 Lines
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H		#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H

#include "llvm/BinaryFormat/MsgPackDocument.h"		#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"		#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/Alignment.h"		#include "llvm/Support/Alignment.h"

namespace llvm {		namespace llvm {

		class AMDGPUTargetMachine;
class AMDGPUTargetStreamer;		class AMDGPUTargetStreamer;
class Argument;		class Argument;
class DataLayout;		class DataLayout;
class Function;		class Function;
class MachineFunction;		class MachineFunction;
class MDNode;		class MDNode;
class Module;		class Module;
struct SIProgramInfo;		struct SIProgramInfo;
Show All 21 Lines	public:

virtual void emitKernel(const MachineFunction &MF,		virtual void emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) = 0;		const SIProgramInfo &ProgramInfo) = 0;

protected:		protected:
virtual void emitVersion() = 0;		virtual void emitVersion() = 0;
virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,		virtual void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) = 0;		msgpack::ArrayDocNode Args) = 0;
virtual void emitKernelAttrs(const Function &Func,		virtual void emitKernelAttrs(const AMDGPUTargetMachine &TM,
		const Function &Func,
msgpack::MapDocNode Kern) = 0;		msgpack::MapDocNode Kern) = 0;
};		};

class MetadataStreamerMsgPackV4 : public MetadataStreamer {		class MetadataStreamerMsgPackV4 : public MetadataStreamer {
protected:		protected:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =		std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();		std::make_unique<msgpack::Document>();

Show All 20 Lines	protected:
void emitVersion() override;		void emitVersion() override;

void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);		void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);

void emitPrintf(const Module &Mod);		void emitPrintf(const Module &Mod);

void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);		void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);

void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) override;		void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func,
		msgpack::MapDocNode Kern) override;

void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);		void emitKernelArgs(const MachineFunction &MF, msgpack::MapDocNode Kern);

void emitKernelArg(const Argument &Arg, unsigned &Offset,		void emitKernelArg(const Argument &Arg, unsigned &Offset,
msgpack::ArrayDocNode Args);		msgpack::ArrayDocNode Args);

void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,		void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
StringRef ValueKind, unsigned &Offset,		StringRef ValueKind, unsigned &Offset,
Show All 29 Lines	void emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) override;		const SIProgramInfo &ProgramInfo) override;
};		};

class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 {		class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 {
protected:		protected:
void emitVersion() override;		void emitVersion() override;
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,		void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) override;		msgpack::ArrayDocNode Args) override;
void emitKernelAttrs(const Function &Func, msgpack::MapDocNode Kern) override;		void emitKernelAttrs(const AMDGPUTargetMachine &TM, const Function &Func,
		msgpack::MapDocNode Kern) override;

public:		public:
MetadataStreamerMsgPackV5() = default;		MetadataStreamerMsgPackV5() = default;
~MetadataStreamerMsgPackV5() = default;		~MetadataStreamerMsgPackV5() = default;
};		};

} // end namespace HSAMD		} // end namespace HSAMD
} // end namespace AMDGPU		} // end namespace AMDGPU
} // end namespace llvm		} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H		#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUHSAMETADATASTREAMER_H

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

Show All 13 Lines

#include "AMDGPUHSAMetadataStreamer.h"		#include "AMDGPUHSAMetadataStreamer.h"
#include "AMDGPU.h"		#include "AMDGPU.h"
#include "GCNSubtarget.h"		#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUTargetStreamer.h"		#include "MCTargetDesc/AMDGPUTargetStreamer.h"
#include "SIMachineFunctionInfo.h"		#include "SIMachineFunctionInfo.h"
#include "SIProgramInfo.h"		#include "SIProgramInfo.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
		#include "llvm/Target/TargetLoweringObjectFile.h"

using namespace llvm;		using namespace llvm;

static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,		static std::pair<Type *, Align> getArgumentTypeAlign(const Argument &Arg,
const DataLayout &DL) {		const DataLayout &DL) {
Type *Ty = Arg.getType();		Type *Ty = Arg.getType();
MaybeAlign ArgAlign;		MaybeAlign ArgAlign;
if (Arg.hasByRefAttr()) {		if (Arg.hasByRefAttr()) {
Ty = Arg.getParamByRefType();		Ty = Arg.getParamByRefType();
ArgAlign = Arg.getParamAlign();		ArgAlign = Arg.getParamAlign();
}		}

if (!ArgAlign)		if (!ArgAlign)
ArgAlign = DL.getABITypeAlign(Ty);		ArgAlign = DL.getABITypeAlign(Ty);

return std::pair(Ty, *ArgAlign);		return std::pair(Ty, *ArgAlign);
}		}

		/// Find the mangled symbol name for the runtime handle for \p EnqueuedBlock.
		///
		/// The handle is looked up through the function's !associated metadata and
		/// is only accepted if it is a global variable placed in the
		/// ".amdgpu.kernel.runtime.handle" section.
		///
		/// \returns the handle's name as produced by \p TM's mangler, or an empty
		/// string if \p EnqueuedBlock has no such runtime handle.
		static std::string getEnqueuedBlockSymbolName(const AMDGPUTargetMachine &TM,
		                                              const Function &EnqueuedBlock) {
		  const MDNode *Associated =
		      EnqueuedBlock.getMetadata(LLVMContext::MD_associated);
		  if (!Associated)
		    return "";

		  auto *VM = cast<ValueAsMetadata>(Associated->getOperand(0));
		  auto *RuntimeHandle =
		      dyn_cast<GlobalVariable>(VM->getValue()->stripPointerCasts());
		  if (!RuntimeHandle ||
		      RuntimeHandle->getSection() != ".amdgpu.kernel.runtime.handle")
		    return "";

		  SmallString<128> Name;
		  TM.getNameWithPrefix(Name, RuntimeHandle,
		                       TM.getObjFileLowering()->getMangler());
		  return Name.str().str();
		}

namespace llvm {		namespace llvm {

static cl::opt<bool> DumpHSAMetadata(		static cl::opt<bool> DumpHSAMetadata(
"amdgpu-dump-hsa-metadata",		"amdgpu-dump-hsa-metadata",
cl::desc("Dump AMDGPU HSA Metadata"));		cl::desc("Dump AMDGPU HSA Metadata"));
static cl::opt<bool> VerifyHSAMetadata(		static cl::opt<bool> VerifyHSAMetadata(
"amdgpu-verify-hsa-metadata",		"amdgpu-verify-hsa-metadata",
cl::desc("Verify AMDGPU HSA Metadata"));		cl::desc("Verify AMDGPU HSA Metadata"));

namespace AMDGPU {		namespace AMDGPU {
namespace HSAMD {		namespace HSAMD {

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// HSAMetadataStreamerV4		// HSAMetadataStreamerV4
		kzhuravlUnsubmitted Not Done Reply Inline Actions Do we really need/want to update code object v2? kzhuravl: Do we really need/want to update code object v2?
		arsenmAuthorUnsubmitted Done Reply Inline Actions as long as the code is here yes. Not updating it would mean maintaining two paths in the implementation. This is just changing the internal representation arsenm: as long as the code is here yes. Not updating it would mean maintaining two paths in the…
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

// Writes the banner "AMDGPU HSA Metadata:" followed by HSAMetadataString and
// a trailing newline to errs(). Identical in both columns of the diff.
void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {		void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';		errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
}		}

void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {		void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";		errs() << "AMDGPU HSA Metadata Parser Test: ";
▲ Show 20 Lines • Show All 163 Lines • ▼ Show 20 Lines	void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
auto LanguageVersion = Kern.getDocument()->getArrayNode();		auto LanguageVersion = Kern.getDocument()->getArrayNode();
LanguageVersion.push_back(Kern.getDocument()->getNode(		LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));		mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue()));
LanguageVersion.push_back(Kern.getDocument()->getNode(		LanguageVersion.push_back(Kern.getDocument()->getNode(
mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));		mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue()));
Kern[".language_version"] = LanguageVersion;		Kern[".language_version"] = LanguageVersion;
}		}

// Fills per-kernel attribute entries of the msgpack map Kern from Func.
// Left column = old code, right column = new code; the new signature gains a
// leading `const AMDGPUTargetMachine &TM` parameter.
void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,		void MetadataStreamerMsgPackV4::emitKernelAttrs(const AMDGPUTargetMachine &TM,
		const Function &Func,
msgpack::MapDocNode Kern) {		msgpack::MapDocNode Kern) {

// Work-group size metadata is copied only when the metadata node is present.
if (auto Node = Func.getMetadata("reqd_work_group_size"))		if (auto Node = Func.getMetadata("reqd_work_group_size"))
Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);		Kern[".reqd_workgroup_size"] = getWorkGroupDimensions(Node);
if (auto Node = Func.getMetadata("work_group_size_hint"))		if (auto Node = Func.getMetadata("work_group_size_hint"))
Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);		Kern[".workgroup_size_hint"] = getWorkGroupDimensions(Node);
// vec_type_hint: operand 0 supplies the type, operand 1 an integer constant
// forwarded to getTypeName().
if (auto Node = Func.getMetadata("vec_type_hint")) {		if (auto Node = Func.getMetadata("vec_type_hint")) {
Kern[".vec_type_hint"] = Kern.getDocument()->getNode(		Kern[".vec_type_hint"] = Kern.getDocument()->getNode(
getTypeName(		getTypeName(
cast<ValueAsMetadata>(Node->getOperand(0))->getType(),		cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),		mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue()),
/Copy=/true);		/Copy=/true);
}		}
// .device_enqueue_symbol: the old code reads the "runtime-handle" function
// attribute; the new code asks getEnqueuedBlockSymbolName(TM, Func) and emits
// the entry only when the returned name is non-empty.
if (Func.hasFnAttribute("runtime-handle")) {
Kern[".device_enqueue_symbol"] = Kern.getDocument()->getNode(		std::string HandleName = getEnqueuedBlockSymbolName(TM, Func);
Func.getFnAttribute("runtime-handle").getValueAsString().str(),		if (!HandleName.empty()) {
/Copy=/true);		Kern[".device_enqueue_symbol"] =
		Kern.getDocument()->getNode(std::move(HandleName), /Copy=/true);
}		}

// .kind is "init" for "device-init" functions, else "fini" for "device-fini".
if (Func.hasFnAttribute("device-init"))		if (Func.hasFnAttribute("device-init"))
Kern[".kind"] = Kern.getDocument()->getNode("init");		Kern[".kind"] = Kern.getDocument()->getNode("init");
else if (Func.hasFnAttribute("device-fini"))		else if (Func.hasFnAttribute("device-fini"))
Kern[".kind"] = Kern.getDocument()->getNode("fini");		Kern[".kind"] = Kern.getDocument()->getNode("fini");
}		}

void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,		void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
msgpack::MapDocNode Kern) {		msgpack::MapDocNode Kern) {
▲ Show 20 Lines • Show All 268 Lines • ▼ Show 20 Lines

// Builds the metadata map for one kernel and appends it to the
// "amdhsa.kernels" array of the root metadata.
void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,		void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {		const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();		auto &Func = MF.getFunction();
// Only functions with the AMDGPU_KERNEL or SPIR_KERNEL calling convention
// get an entry; everything else returns without emitting anything.
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&		if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
Func.getCallingConv() != CallingConv::SPIR_KERNEL)		Func.getCallingConv() != CallingConv::SPIR_KERNEL)
return;		return;

// New code only: the target machine is needed by emitKernelAttrs below.
		const auto &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent());		auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent());
auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);		auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);

auto Kernels =		auto Kernels =
getRootMetadata("amdhsa.kernels").getArray(/Convert=/true);		getRootMetadata("amdhsa.kernels").getArray(/Convert=/true);

{		{
// .symbol is the function name with a ".kd" suffix.
Kern[".name"] = Kern.getDocument()->getNode(Func.getName());		Kern[".name"] = Kern.getDocument()->getNode(Func.getName());
Kern[".symbol"] = Kern.getDocument()->getNode(		Kern[".symbol"] = Kern.getDocument()->getNode(
(Twine(Func.getName()) + Twine(".kd")).str(), /Copy=/true);		(Twine(Func.getName()) + Twine(".kd")).str(), /Copy=/true);
emitKernelLanguage(Func, Kern);		emitKernelLanguage(Func, Kern);
emitKernelAttrs(Func, Kern);		emitKernelAttrs(TM, Func, Kern);
emitKernelArgs(MF, Kern);		emitKernelArgs(MF, Kern);
}		}

Kernels.push_back(Kern);		Kernels.push_back(Kern);
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// HSAMetadataStreamerV5		// HSAMetadataStreamerV5
▲ Show 20 Lines • Show All 101 Lines • ▼ Show 20 Lines	void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
} else {		} else {
Offset += 8; // Skipped.		Offset += 8; // Skipped.
}		}

if (MFI.getUserSGPRInfo().hasQueuePtr())		if (MFI.getUserSGPRInfo().hasQueuePtr())
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);		emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
}		}

// V5 kernel attributes: everything the V4 implementation emits, plus
// ".uniform_work_group_size" = 1 when the "uniform-work-group-size" function
// attribute evaluates to true. The new column forwards the added TM parameter.
void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,		void MetadataStreamerMsgPackV5::emitKernelAttrs(const AMDGPUTargetMachine &TM,
		const Function &Func,
msgpack::MapDocNode Kern) {		msgpack::MapDocNode Kern) {
MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);		MetadataStreamerMsgPackV4::emitKernelAttrs(TM, Func, Kern);

if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())		if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);		Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
}		}


} // end namespace HSAMD		} // end namespace HSAMD
} // end namespace AMDGPU		} // end namespace AMDGPU
} // end namespace llvm		} // end namespace llvm

llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp

This file was deleted.

	//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===//
	//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//
	//===----------------------------------------------------------------------===//
	//
	// \file
	// This post-linking pass replaces the function pointer of enqueued
	// block kernel with a global variable (runtime handle) and adds
	// "runtime-handle" attribute to the enqueued block kernel.
	//
	// In LLVM CodeGen the runtime-handle metadata will be translated to
	// RuntimeHandle metadata in the code object. The runtime allocates a global
	// buffer for each kernel with RuntimeHandle metadata and saves the kernel
	// address required for the AQL packet into the buffer. The __enqueue_kernel
	// function in the device library knows that the invoke function pointer in
	// the block literal is actually a runtime handle; it loads the kernel address
	// from it and puts it into the AQL packet for dispatching.
	//
	// This cannot be done in FE since FE cannot create a unique global variable
	// with external linkage across LLVM modules. The global variable with internal
	// linkage does not work since optimization passes will try to replace loads
	// of the global variable with its initialization value.
	//
	// It also identifies the kernels that directly or indirectly enqueue kernels
	// and adds the "calls-enqueue-kernel" function attribute to them, which will
	// be used to determine whether to emit runtime metadata for the kernel
	// enqueue related hidden kernel arguments.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPU.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Mangler.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Debug.h"

	#define DEBUG_TYPE "amdgpu-lower-enqueued-block"

	using namespace llvm;

	namespace {

	/// Module pass that lowers enqueued blocks; all of the work happens in
	/// runOnModule().
	class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass {
	public:
	// Pass identification object; passed to the ModulePass constructor below.
	static char ID;

	explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {}

	private:
	// Per-module entry point; the return value reports whether M was changed.
	bool runOnModule(Module &M) override;
	};

	} // end anonymous namespace

	// Storage for the pass ID declared in the class.
	char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0;

	// Reference to the ID that is visible outside the anonymous namespace.
	char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID =
	AMDGPUOpenCLEnqueuedBlockLowering::ID;

	// Registers the pass under DEBUG_TYPE ("amdgpu-lower-enqueued-block") with
	// the description below; the two trailing flags are both false.
	INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, DEBUG_TYPE,
	"Lower OpenCL enqueued blocks", false, false)

	/// Factory for the enqueued-block lowering pass. Returns a pointer to a new
	/// heap-allocated AMDGPUOpenCLEnqueuedBlockLowering instance.
	ModulePass *llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
	  ModulePass *LoweringPass = new AMDGPUOpenCLEnqueuedBlockLowering();
	  return LoweringPass;
	}

	/// Replace every use of each "enqueued-block" function in \p M with the
	/// address of a newly created runtime handle global.
	///
	/// For each function carrying the "enqueued-block" attribute this:
	///  - names it via Mangler::getNameWithPrefix("__amdgpu_enqueued_kernel") if
	///    it has no name,
	///  - creates a constant, externally initialized, zero-initialized global
	///    "<function name>.runtime_handle" of type block.runtime.handle.t with
	///    external linkage in AMDGPUAS::GLOBAL_ADDRESS,
	///  - replaces all uses of the function with an addrspacecast of that global
	///    to the function's own pointer type,
	///  - records the handle name in the "runtime-handle" function attribute and
	///    gives the function external linkage.
	///
	/// \returns true if at least one function was lowered, false otherwise.
	bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
	  auto &C = M.getContext();
	  bool Changed = false;

	  // ptr kernel_object, i32 private_segment_size, i32 group_segment_size
	  // Created on first use so that modules without enqueued blocks do not get
	  // the struct type.
	  StructType *HandleTy = nullptr;

	  for (auto &F : M.functions()) {
	    if (!F.hasFnAttribute("enqueued-block"))
	      continue;

	    // The handle name is derived from the function name, so give unnamed
	    // functions a name first.
	    if (!F.hasName()) {
	      SmallString<64> Name;
	      Mangler::getNameWithPrefix(Name, "__amdgpu_enqueued_kernel",
	                                 M.getDataLayout());
	      F.setName(Name);
	    }
	    LLVM_DEBUG(dbgs() << "found enqueued kernel: " << F.getName() << '\n');

	    auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
	    if (!HandleTy) {
	      Type *Int32 = Type::getInt32Ty(C);
	      HandleTy =
	          StructType::create(C, {PointerType::getUnqual(C), Int32, Int32},
	                             "block.runtime.handle.t");
	    }

	    auto *GV = new GlobalVariable(
	        M, HandleTy,
	        /*isConstant=*/true, GlobalValue::ExternalLinkage,
	        /*Initializer=*/Constant::getNullValue(HandleTy), RuntimeHandle,
	        /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
	        AMDGPUAS::GLOBAL_ADDRESS,
	        /*isExternallyInitialized=*/true);
	    LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');

	    // Users of the function now refer to the handle, cast back to the
	    // function's pointer type.
	    F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType()));
	    F.addFnAttr("runtime-handle", RuntimeHandle);
	    F.setLinkage(GlobalValue::ExternalLinkage);
	    Changed = true;
	  }

	  return Changed;
	}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 383 Lines • ▼ Show 20 Lines	extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAttributorLegacyPass(*PR);		initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);		initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);		initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);		initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);		initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);		initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);		initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);		initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);		initializeAMDGPUExportKernelRuntimeHandlesPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);		initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);		initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPURegBankCombinerPass(*PR);		initializeAMDGPURegBankCombinerPass(*PR);
initializeAMDGPURegBankSelectPass(*PR);		initializeAMDGPURegBankSelectPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);		initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);		initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);		initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPreparePass(*PR);		initializeAMDGPULateCodeGenPreparePass(*PR);
▲ Show 20 Lines • Show All 608 Lines • ▼ Show 20 Lines	void AMDGPUPassConfig::addIRPasses() {
// Function calls are not supported, so make sure we inline everything.		// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());		addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());		addPass(createAlwaysInlinerLegacyPass());

// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.		// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (Arch == Triple::r600)		if (Arch == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());		addPass(createR600OpenCLImageTypeLoweringPass());

// Replace OpenCL enqueued block function pointers with global variables.		// Make enqueued block runtime handles externally visible.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());		addPass(createAMDGPUExportKernelRuntimeHandlesPass());

// Runs before PromoteAlloca so the latter can account for function uses		// Runs before PromoteAlloca so the latter can account for function uses
if (EnableLowerModuleLDS) {		if (EnableLowerModuleLDS) {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));		addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}		}

// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run		// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
// after their introduction		// after their introduction
▲ Show 20 Lines • Show All 645 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/CMakeLists.txt

Show First 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	add_llvm_target(AMDGPUCodeGen
AMDGPUAsmPrinter.cpp		AMDGPUAsmPrinter.cpp
AMDGPUAtomicOptimizer.cpp		AMDGPUAtomicOptimizer.cpp
AMDGPUAttributor.cpp		AMDGPUAttributor.cpp
AMDGPUCallLowering.cpp		AMDGPUCallLowering.cpp
AMDGPUCodeGenPrepare.cpp		AMDGPUCodeGenPrepare.cpp
AMDGPUCombinerHelper.cpp		AMDGPUCombinerHelper.cpp
AMDGPUCtorDtorLowering.cpp		AMDGPUCtorDtorLowering.cpp
AMDGPUExportClustering.cpp		AMDGPUExportClustering.cpp
		AMDGPUExportKernelRuntimeHandles.cpp
AMDGPUFrameLowering.cpp		AMDGPUFrameLowering.cpp
AMDGPUGlobalISelUtils.cpp		AMDGPUGlobalISelUtils.cpp
AMDGPUHSAMetadataStreamer.cpp		AMDGPUHSAMetadataStreamer.cpp
AMDGPUInsertDelayAlu.cpp		AMDGPUInsertDelayAlu.cpp
AMDGPUInstCombineIntrinsic.cpp		AMDGPUInstCombineIntrinsic.cpp
AMDGPUInstrInfo.cpp		AMDGPUInstrInfo.cpp
AMDGPUInstructionSelector.cpp		AMDGPUInstructionSelector.cpp
AMDGPUISelDAGToDAG.cpp		AMDGPUISelDAGToDAG.cpp
AMDGPUISelLowering.cpp		AMDGPUISelLowering.cpp
AMDGPULateCodeGenPrepare.cpp		AMDGPULateCodeGenPrepare.cpp
AMDGPULegalizerInfo.cpp		AMDGPULegalizerInfo.cpp
AMDGPULibCalls.cpp		AMDGPULibCalls.cpp
AMDGPUImageIntrinsicOptimizer.cpp		AMDGPUImageIntrinsicOptimizer.cpp
AMDGPULibFunc.cpp		AMDGPULibFunc.cpp
AMDGPULowerKernelArguments.cpp		AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp		AMDGPULowerKernelAttributes.cpp
AMDGPULowerModuleLDSPass.cpp		AMDGPULowerModuleLDSPass.cpp
AMDGPUMachineCFGStructurizer.cpp		AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp		AMDGPUMachineFunction.cpp
AMDGPUMachineModuleInfo.cpp		AMDGPUMachineModuleInfo.cpp
AMDGPUMacroFusion.cpp		AMDGPUMacroFusion.cpp
AMDGPUMCInstLower.cpp		AMDGPUMCInstLower.cpp
AMDGPUIGroupLP.cpp		AMDGPUIGroupLP.cpp
AMDGPUMIRFormatter.cpp		AMDGPUMIRFormatter.cpp
AMDGPUOpenCLEnqueuedBlockLowering.cpp
AMDGPUPerfHintAnalysis.cpp		AMDGPUPerfHintAnalysis.cpp
AMDGPUPostLegalizerCombiner.cpp		AMDGPUPostLegalizerCombiner.cpp
AMDGPUPreLegalizerCombiner.cpp		AMDGPUPreLegalizerCombiner.cpp
AMDGPUPrintfRuntimeBinding.cpp		AMDGPUPrintfRuntimeBinding.cpp
AMDGPUPromoteAlloca.cpp		AMDGPUPromoteAlloca.cpp
AMDGPUPromoteKernelArguments.cpp		AMDGPUPromoteKernelArguments.cpp
AMDGPURegBankCombiner.cpp		AMDGPURegBankCombiner.cpp
AMDGPURegBankSelect.cpp		AMDGPURegBankSelect.cpp
▲ Show 20 Lines • Show All 112 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
				; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-export-kernel-runtime-handles < %s | FileCheck %s

				sameerds (inline comment, not done): Is there any visible effect of the pass being tested? Or is the intention simply to check that the output is the same as the input, and that there is no error?
				; Runtime handle layout: one addrspace(1) pointer followed by two i32 fields.
				%block.runtime.handle.t = type { ptr addrspace(1), i32, i32 }

				; associated globals without the correct section should be ignored.
				; @block.handle is placed in section ".amdgpu.kernel.runtime.handle";
				; @not.a.block.handle has the same type and linkage but no section.
				@block.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
				@not.a.block.handle = internal addrspace(1) externally_initialized constant %block.runtime.handle.t zeroinitializer

				;.
				; CHECK: @[[BLOCK_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer, section ".amdgpu.kernel.runtime.handle"
				; CHECK: @[[NOT_A_BLOCK_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
				;.
				; Internal kernel whose !associated metadata (!0) references @block.handle,
				; the global that carries the runtime handle section.
				define internal amdgpu_kernel void @block_kernel() !associated !0 {
				; CHECK-LABEL: define {{[^@]+}}@block_kernel() !associated !0 {
				; CHECK-NEXT: ret void
				;
				ret void
				}

				; Same association as @block_kernel (!0 -> @block.handle), but the function
				; is additionally marked dso_local.
				define internal dso_local amdgpu_kernel void @dso_local_block_kernel() !associated !0 {
				; CHECK-LABEL: define {{[^@]+}}@dso_local_block_kernel() !associated !0 {
				; CHECK-NEXT: ret void
				;
				ret void
				}

				; Kernel whose !associated metadata (!1) references @not.a.block.handle, a
				; global without the runtime handle section.
				define internal amdgpu_kernel void @not_block_kernel() !associated !1 {
				; CHECK-LABEL: define {{[^@]+}}@not_block_kernel() !associated !1 {
				; CHECK-NEXT: ret void
				;
				ret void
				}

				; Kernel whose !associated metadata (!2) holds a null addrspace(1) pointer
				; rather than a global.
				define internal amdgpu_kernel void @associated_null() !associated !2 {
				; CHECK-LABEL: define {{[^@]+}}@associated_null() !associated !2 {
				; CHECK-NEXT: ret void
				;
				ret void
				}

				; Kernel with no !associated metadata at all.
				define internal amdgpu_kernel void @no_metadata() {
				; CHECK-LABEL: define {{[^@]+}}@no_metadata() {
				; CHECK-NEXT: ret void
				;
				ret void
				}

				; Operands of the !associated attachments used above: a handle in the
				; runtime handle section (!0), a global without that section (!1), and a
				; null pointer (!2).
				!0 = !{ptr addrspace(1) @block.handle }
				!1 = !{ptr addrspace(1) @not.a.block.handle }
				!2 = !{ptr addrspace(1) null }

				;.
				; CHECK: [[META0:![0-9]+]] = !{ptr addrspace(1) @block.handle}
				; CHECK: [[META1:![0-9]+]] = !{ptr addrspace(1) @not.a.block.handle}
				; CHECK: [[META2:![0-9]+]] = !{ptr addrspace(1) null}
				;.

llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll

This file was deleted.

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
	; RUN: opt -data-layout=A5 -amdgpu-lower-enqueued-block -S < %s | FileCheck %s

	%struct.ndrange_t = type { i32 }
	%opencl.queue_t = type opaque

	; Kernel that references no enqueued block; its body is a bare return.
	define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
	ret void
	}

	; Kernel that fills in two block literals on the stack (addrspace(5) allocas)
	; and passes them to @__enqueue_kernel_basic five times. The invoke function
	; operands are @__test_block_invoke_kernel (twice), the unnamed functions @0
	; and @1, and @__test_block_invoke_2_kernel.
	define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
	entry:
	%block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
	%inst = alloca %struct.ndrange_t, align 4, addrspace(5)
	%block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
	%inst3 = alloca %struct.ndrange_t, align 4, addrspace(5)
	; First block literal: fields 0 and 1 are set to 25 and 8, fields 2 and 3
	; capture %a and %b.
	%block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0
	store i32 25, ptr addrspace(5) %block.size, align 8
	%block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1
	store i32 8, ptr addrspace(5) %block.align, align 4
	%block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2
	store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8
	%block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3
	store i8 %b, ptr addrspace(5) %block.captured1, align 8
	%inst4 = addrspacecast ptr addrspace(5) %block to ptr
	; Four enqueues of the first block literal.
	%inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
	ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
	%inst10 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
	ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
	%inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
	ptr @0, ptr nonnull %inst4) #2
	%inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
	ptr @1, ptr nonnull %inst4) #2
	; Second block literal: fields 0 and 1 are set to 41 and 8, the remaining
	; fields capture %a, %c, %d and %b.
	%block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0
	store i32 41, ptr addrspace(5) %block.size4, align 8
	%block.align5 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1
	store i32 8, ptr addrspace(5) %block.align5, align 4
	%block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2
	store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8
	%block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5
	store i8 %b, ptr addrspace(5) %block.captured8, align 8
	%block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3
	store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8
	%block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4
	store i64 %d, ptr addrspace(5) %block.captured10, align 8
	%inst8 = addrspacecast ptr addrspace(5) %block2 to ptr
	%inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3,
	ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2
	ret void
	}

	; __enqueue_kernel* functions may get inlined
	; Here the enqueued kernel's address is not a call operand: it is cast to
	; addrspace(1), an i64 is loaded through it, and the value is stored to %c.
	define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
	entry:
	%inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1))
	store i64 %inst, ptr addrspace(1) %c
	ret void
	}

	; Internal kernel in attribute group #0 ("enqueued-block"). Stores the i8 in
	; field 3 of %arg through the pointer in field 2.
	define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
	entry:
	%.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
	%.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
	store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
	ret void
	}

	; External declaration of the enqueue entry point called by @caller; the
	; fourth parameter receives the invoke function pointer.
	declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr

	; Internal kernel in attribute group #0 ("enqueued-block") taking the larger
	; block literal. Stores field 5 (i8) through the pointer in field 2 and
	; field 4 (i64) through the pointer in field 3.
	define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 {
	entry:
	%.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2
	%.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3
	%.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4
	%.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5
	store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1
	store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8
	ret void
	}

	; Global initializer that takes the address of an enqueued-block kernel, so
	; the kernel has a use outside any call.
	@kernel_address_user = global [1 x ptr] [ ptr @block_has_used_kernel_address ]

	; Internal kernel in attribute group #0 ("enqueued-block") whose address is
	; referenced by @kernel_address_user and @user_of_kernel_address.
	define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
	entry:
	%.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
	%.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
	store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
	ret void
	}

	; Kernel that stores the address of @block_has_used_kernel_address to %arg.
	define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) {
	store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg
	ret void
	}

	; First unnamed enqueued-block kernel (attribute group #0); passed to
	; @__enqueue_kernel_basic as `ptr @0` in @caller.
	define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
	ret void
	}

	; Second unnamed enqueued-block kernel (attribute group #0); passed to
	; @__enqueue_kernel_basic as `ptr @1` in @caller.
	define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
	ret void
	}

	; Attribute group shared by every kernel that the pass under test should
	; treat as an enqueued block.
	attributes #0 = { "enqueued-block" }
	;.
	; CHECK: @[[KERNEL_ADDRESS_USER:[a-zA-Z0-9_$"\\.-]+]] = global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr)]
	; CHECK: @[[__TEST_BLOCK_INVOKE_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
	; CHECK: @[[__TEST_BLOCK_INVOKE_2_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
	; CHECK: @[[BLOCK_HAS_USED_KERNEL_ADDRESS_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
	; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
	; CHECK: @[[__AMDGPU_ENQUEUED_KERNEL_1_RUNTIME_HANDLE:[a-zA-Z0-9_$"\\.-]+]] = addrspace(1) externally_initialized constant [[BLOCK_RUNTIME_HANDLE_T:%.*]] zeroinitializer
	;.
	; CHECK-LABEL: define {{[^@]+}}@non_caller
	; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@caller
	; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[BLOCK:%.*]] = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
	; CHECK-NEXT: [[INST:%.*]] = alloca [[STRUCT_NDRANGE_T:%.*]], align 4, addrspace(5)
	; CHECK-NEXT: [[BLOCK2:%.*]] = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
	; CHECK-NEXT: [[INST3:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5)
	; CHECK-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 0
	; CHECK-NEXT: store i32 25, ptr addrspace(5) [[BLOCK_SIZE]], align 8
	; CHECK-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 1
	; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN]], align 4
	; CHECK-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 2
	; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED]], align 8
	; CHECK-NEXT: [[BLOCK_CAPTURED1:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) [[BLOCK]], i32 0, i32 3
	; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED1]], align 8
	; CHECK-NEXT: [[INST4:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK]] to ptr
	; CHECK-NEXT: [[INST5:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
	; CHECK-NEXT: [[INST10:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
	; CHECK-NEXT: [[INST11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.runtime_handle to ptr), ptr nonnull [[INST4]])
	; CHECK-NEXT: [[INST12:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST]], ptr addrspacecast (ptr addrspace(1) @__amdgpu_enqueued_kernel.1.runtime_handle to ptr), ptr nonnull [[INST4]])
	; CHECK-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 0
	; CHECK-NEXT: store i32 41, ptr addrspace(5) [[BLOCK_SIZE4]], align 8
	; CHECK-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 1
	; CHECK-NEXT: store i32 8, ptr addrspace(5) [[BLOCK_ALIGN5]], align 4
	; CHECK-NEXT: [[BLOCK_CAPTURED7:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 2
	; CHECK-NEXT: store ptr addrspace(1) [[A]], ptr addrspace(5) [[BLOCK_CAPTURED7]], align 8
	; CHECK-NEXT: [[BLOCK_CAPTURED8:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 5
	; CHECK-NEXT: store i8 [[B]], ptr addrspace(5) [[BLOCK_CAPTURED8]], align 8
	; CHECK-NEXT: [[BLOCK_CAPTURED9:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 3
	; CHECK-NEXT: store ptr addrspace(1) [[C]], ptr addrspace(5) [[BLOCK_CAPTURED9]], align 8
	; CHECK-NEXT: [[BLOCK_CAPTURED10:%.*]] = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) [[BLOCK2]], i32 0, i32 4
	; CHECK-NEXT: store i64 [[D]], ptr addrspace(5) [[BLOCK_CAPTURED10]], align 8
	; CHECK-NEXT: [[INST8:%.*]] = addrspacecast ptr addrspace(5) [[BLOCK2]] to ptr
	; CHECK-NEXT: [[INST9:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) nonnull byval([[STRUCT_NDRANGE_T]]) [[INST3]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime_handle to ptr), ptr nonnull [[INST8]])
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@inlined_caller
	; CHECK-SAME: (ptr addrspace(1) [[A:%.*]], i8 [[B:%.*]], ptr addrspace(1) [[C:%.*]], i64 [[D:%.*]]) {
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[INST:%.*]] = load i64, ptr addrspace(1) @__test_block_invoke_kernel.runtime_handle, align 4
	; CHECK-NEXT: store i64 [[INST]], ptr addrspace(1) [[C]], align 4
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_kernel
	; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2
	; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3
	; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@__test_block_invoke_2_kernel
	; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG:%.*]]) #[[ATTR1:[0-9]+]] {
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 2
	; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 3
	; CHECK-NEXT: [[DOTFCA_5_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 4
	; CHECK-NEXT: [[DOTFCA_6_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> [[ARG]], 5
	; CHECK-NEXT: store i8 [[DOTFCA_6_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
	; CHECK-NEXT: store i64 [[DOTFCA_5_EXTRACT]], ptr addrspace(1) [[DOTFCA_4_EXTRACT]], align 8
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@block_has_used_kernel_address
	; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
	; CHECK-NEXT: entry:
	; CHECK-NEXT: [[DOTFCA_3_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 2
	; CHECK-NEXT: [[DOTFCA_4_EXTRACT:%.*]] = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> [[ARG]], 3
	; CHECK-NEXT: store i8 [[DOTFCA_4_EXTRACT]], ptr addrspace(1) [[DOTFCA_3_EXTRACT]], align 1
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@user_of_kernel_address
	; CHECK-SAME: (ptr addrspace(1) [[ARG:%.*]]) {
	; CHECK-NEXT: store ptr addrspacecast (ptr addrspace(1) @block_has_used_kernel_address.runtime_handle to ptr), ptr addrspace(1) [[ARG]], align 8
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel
	; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR3:[0-9]+]] {
	; CHECK-NEXT: ret void
	;
	;
	; CHECK-LABEL: define {{[^@]+}}@__amdgpu_enqueued_kernel.1
	; CHECK-SAME: (<{ i32, i32, ptr addrspace(1), i8 }> [[ARG:%.*]]) #[[ATTR4:[0-9]+]] {
	; CHECK-NEXT: ret void
	;
	;.
	; CHECK: attributes #[[ATTR0]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_kernel.runtime_handle" }
	; CHECK: attributes #[[ATTR1]] = { "enqueued-block" "runtime-handle"="__test_block_invoke_2_kernel.runtime_handle" }
	; CHECK: attributes #[[ATTR2]] = { "enqueued-block" "runtime-handle"="block_has_used_kernel_address.runtime_handle" }
	; CHECK: attributes #[[ATTR3]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.runtime_handle" }
	; CHECK: attributes #[[ATTR4]] = { "enqueued-block" "runtime-handle"="__amdgpu_enqueued_kernel.1.runtime_handle" }
	;.

llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll

	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readelf --notes - | FileCheck --check-prefix=CHECK %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
	; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s			; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s

	%struct.A = type { i8, float }			%struct.A = type { i8, float }
	%opencl.image1d_t = type opaque			%opencl.image1d_t = type opaque
	%opencl.image2d_t = type opaque			%opencl.image2d_t = type opaque
	%opencl.image3d_t = type opaque			%opencl.image3d_t = type opaque
	%opencl.queue_t = type opaque			%opencl.queue_t = type opaque
	%opencl.pipe_t = type opaque			%opencl.pipe_t = type opaque
	%struct.B = type { ptr addrspace(1) }			%struct.B = type { ptr addrspace(1) }
	%opencl.clk_event_t = type opaque			%opencl.clk_event_t = type opaque

	@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1)			@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant ptr addrspace(1), section ".amdgpu.kernel.runtime.handle"
				@not.a.handle = external addrspace(1) externally_initialized constant ptr addrspace(1)

	; CHECK: ---			; CHECK: ---
	; CHECK-NEXT: amdhsa.kernels:			; CHECK-NEXT: amdhsa.kernels:
	; CHECK-NEXT: - .args:			; CHECK-NEXT: - .args:
	; CHECK-NEXT: - .name: a			; CHECK-NEXT: - .name: a
	; CHECK-NEXT: .offset: 0			; CHECK-NEXT: .offset: 0
	; CHECK-NEXT: .size: 1			; CHECK-NEXT: .size: 1
	; CHECK-NEXT: .type_name: char			; CHECK-NEXT: .type_name: char
	▲ Show 20 Lines • Show All 1,647 Lines • ▼ Show 20 Lines
	; CHECK: .device_enqueue_symbol: __test_block_invoke_kernel_runtime_handle			; CHECK: .device_enqueue_symbol: __test_block_invoke_kernel_runtime_handle
	; CHECK: .language: OpenCL C			; CHECK: .language: OpenCL C
	; CHECK-NEXT: .language_version:			; CHECK-NEXT: .language_version:
	; CHECK-NEXT: - 2			; CHECK-NEXT: - 2
	; CHECK-NEXT: - 0			; CHECK-NEXT: - 0
	; CHECK: .name: __test_block_invoke_kernel			; CHECK: .name: __test_block_invoke_kernel
	; CHECK: .symbol: __test_block_invoke_kernel.kd			; CHECK: .symbol: __test_block_invoke_kernel.kd
	define amdgpu_kernel void @__test_block_invoke_kernel(			define amdgpu_kernel void @__test_block_invoke_kernel(
	<{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1			<{ i32, i32, ptr, ptr addrspace(1), i8 }> %arg) #1 !associated !112
	!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110			!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110
	!kernel_arg_base_type !110 !kernel_arg_type_qual !4 {			!kernel_arg_base_type !110 !kernel_arg_type_qual !4 {
	ret void			ret void
	}			}

	; CHECK: - .args:			; CHECK: - .args:
	; CHECK-NEXT: - .name: a			; CHECK-NEXT: - .name: a
	; CHECK-NEXT: .offset: 0			; CHECK-NEXT: .offset: 0
	Show All 39 Lines
	; CHECK-NEXT: .size: 8			; CHECK-NEXT: .size: 8
	; CHECK-NEXT: .value_kind: global_buffer			; CHECK-NEXT: .value_kind: global_buffer
	; CHECK: .name: unknown_addrspace_kernarg			; CHECK: .name: unknown_addrspace_kernarg
	; CHECK: .symbol: unknown_addrspace_kernarg.kd			; CHECK: .symbol: unknown_addrspace_kernarg.kd
	define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) #0 {			define amdgpu_kernel void @unknown_addrspace_kernarg(ptr addrspace(12345) %ptr) #0 {
	ret void			ret void
	}			}

				; Make sure the device_enqueue_symbol is not reported
				; CHECK: - .args: []
				; CHECK-NEXT: .group_segment_fixed_size: 0
				; CHECK-NEXT: .kernarg_segment_align: 4
				; CHECK-NEXT: .kernarg_segment_size: 0
				; CHECK-NEXT: .language: OpenCL C
				; CHECK-NEXT: .language_version:
				; CHECK-NEXT: - 2
				; CHECK-NEXT: - 0
				; CHECK-NEXT: .max_flat_workgroup_size: 1024
				; CHECK-NEXT: .name: associated_global_not_handle
				; CHECK-NEXT: .private_segment_fixed_size: 0
				; CHECK-NEXT: .sgpr_count:
				; CHECK-NEXT: .sgpr_spill_count: 0
				; CHECK-NEXT: .symbol: associated_global_not_handle.kd
				; CHECK-NEXT: .vgpr_count:
				; CHECK-NEXT: .vgpr_spill_count: 0
				; CHECK-NEXT: .wavefront_size: 64
				; CHECK-NOT: device_enqueue_symbol
				define amdgpu_kernel void @associated_global_not_handle() #3 !associated !113 {
				ret void
				}

	; CHECK: amdhsa.printf:			; CHECK: amdhsa.printf:
	; CHECK-NEXT: - '1:1:4:%d\n'			; CHECK-NEXT: - '1:1:4:%d\n'
	; CHECK-NEXT: - '2:1:8:%g\n'			; CHECK-NEXT: - '2:1:8:%g\n'
	; CHECK: amdhsa.version:			; CHECK: amdhsa.version:
	; CHECK-NEXT: - 1			; CHECK-NEXT: - 1
	; CHECK-NEXT: - 1			; CHECK-NEXT: - 1

	attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" }			attributes #0 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" }
	attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }			attributes #1 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
	attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }			attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
				attributes #3 = { optnone noinline "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }

	!llvm.module.flags = !{!0}			!llvm.module.flags = !{!0}
	!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}			!0 = !{i32 1, !"amdgpu_code_object_version", i32 400}

	!llvm.printf.fmts = !{!100, !101}			!llvm.printf.fmts = !{!100, !101}

	!1 = !{i32 0}			!1 = !{i32 0}
	!2 = !{!"none"}			!2 = !{!"none"}
	▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines
	!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}			!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
	!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}			!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
	!93 = !{!"long addrspace(5)", !"char addrspace(5)", !"char2 addrspace(5)", !"char3 addrspace(5)", !"char4 addrspace(5)", !"char8 addrspace(5)", !"char16 addrspace(5)*"}			!93 = !{!"long addrspace(5)", !"char addrspace(5)", !"char2 addrspace(5)", !"char3 addrspace(5)", !"char4 addrspace(5)", !"char8 addrspace(5)", !"char16 addrspace(5)*"}
	!94 = !{!"", !"", !"", !"", !"", !"", !""}			!94 = !{!"", !"", !"", !"", !"", !"", !""}
	!100 = !{!"1:1:4:%d\5Cn"}			!100 = !{!"1:1:4:%d\5Cn"}
	!101 = !{!"2:1:8:%g\5Cn"}			!101 = !{!"2:1:8:%g\5Cn"}
	!110 = !{!"__block_literal"}			!110 = !{!"__block_literal"}
	!111 = !{!"char", !"char"}			!111 = !{!"char", !"char"}
				!112 = !{ptr addrspace(1) @__test_block_invoke_kernel_runtime_handle }
				!113 = !{ptr addrspace(1) @not.a.handle }

	; PARSER: AMDGPU HSA Metadata Parser Test: PASS			; PARSER: AMDGPU HSA Metadata Parser Test: PASS

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

	Show All 31 Lines
	; GCN-O0-NEXT: AMDGPU Printf lowering			; GCN-O0-NEXT: AMDGPU Printf lowering
	; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU			; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU
	; GCN-O0-NEXT: AMDGPU Inline All Functions			; GCN-O0-NEXT: AMDGPU Inline All Functions
	; GCN-O0-NEXT: Inliner for always_inline functions			; GCN-O0-NEXT: Inliner for always_inline functions
	; GCN-O0-NEXT: FunctionPass Manager			; GCN-O0-NEXT: FunctionPass Manager
	; GCN-O0-NEXT: Dominator Tree Construction			; GCN-O0-NEXT: Dominator Tree Construction
	; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl)			; GCN-O0-NEXT: Basic Alias Analysis (stateless AA impl)
	; GCN-O0-NEXT: Function Alias Analysis Results			; GCN-O0-NEXT: Function Alias Analysis Results
	; GCN-O0-NEXT: Lower OpenCL enqueued blocks			; GCN-O0-NEXT: Externalize enqueued block runtime handles
	; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions			; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions
	; GCN-O0-NEXT: FunctionPass Manager			; GCN-O0-NEXT: FunctionPass Manager
	; GCN-O0-NEXT: Expand Atomic instructions			; GCN-O0-NEXT: Expand Atomic instructions
	; GCN-O0-NEXT: Lower constant intrinsics			; GCN-O0-NEXT: Lower constant intrinsics
	; GCN-O0-NEXT: Remove unreachable blocks from the CFG			; GCN-O0-NEXT: Remove unreachable blocks from the CFG
	; GCN-O0-NEXT: Expand vector predication intrinsics			; GCN-O0-NEXT: Expand vector predication intrinsics
	; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics			; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
	; GCN-O0-NEXT: Expand reduction intrinsics			; GCN-O0-NEXT: Expand reduction intrinsics
	▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines
	; GCN-O1-NEXT: AMDGPU Printf lowering			; GCN-O1-NEXT: AMDGPU Printf lowering
	; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU			; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU
	; GCN-O1-NEXT: AMDGPU Inline All Functions			; GCN-O1-NEXT: AMDGPU Inline All Functions
	; GCN-O1-NEXT: Inliner for always_inline functions			; GCN-O1-NEXT: Inliner for always_inline functions
	; GCN-O1-NEXT: FunctionPass Manager			; GCN-O1-NEXT: FunctionPass Manager
	; GCN-O1-NEXT: Dominator Tree Construction			; GCN-O1-NEXT: Dominator Tree Construction
	; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)			; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
	; GCN-O1-NEXT: Function Alias Analysis Results			; GCN-O1-NEXT: Function Alias Analysis Results
	; GCN-O1-NEXT: Lower OpenCL enqueued blocks			; GCN-O1-NEXT: Externalize enqueued block runtime handles
	; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions			; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions
	; GCN-O1-NEXT: AMDGPU Attributor			; GCN-O1-NEXT: AMDGPU Attributor
	; GCN-O1-NEXT: FunctionPass Manager			; GCN-O1-NEXT: FunctionPass Manager
	; GCN-O1-NEXT: Cycle Info Analysis			; GCN-O1-NEXT: Cycle Info Analysis
	; GCN-O1-NEXT: FunctionPass Manager			; GCN-O1-NEXT: FunctionPass Manager
	; GCN-O1-NEXT: Infer address spaces			; GCN-O1-NEXT: Infer address spaces
	; GCN-O1-NEXT: Dominator Tree Construction			; GCN-O1-NEXT: Dominator Tree Construction
	; GCN-O1-NEXT: Cycle Info Analysis			; GCN-O1-NEXT: Cycle Info Analysis
	▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines
	; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering			; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering
	; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU			; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU
	; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions			; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions
	; GCN-O1-OPTS-NEXT: Inliner for always_inline functions			; GCN-O1-OPTS-NEXT: Inliner for always_inline functions
	; GCN-O1-OPTS-NEXT: FunctionPass Manager			; GCN-O1-OPTS-NEXT: FunctionPass Manager
	; GCN-O1-OPTS-NEXT: Dominator Tree Construction			; GCN-O1-OPTS-NEXT: Dominator Tree Construction
	; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)			; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
	; GCN-O1-OPTS-NEXT: Function Alias Analysis Results			; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
	; GCN-O1-OPTS-NEXT: Lower OpenCL enqueued blocks			; GCN-O1-OPTS-NEXT: Externalize enqueued block runtime handles
	; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions			; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions
	; GCN-O1-OPTS-NEXT: AMDGPU Attributor			; GCN-O1-OPTS-NEXT: AMDGPU Attributor
	; GCN-O1-OPTS-NEXT: FunctionPass Manager			; GCN-O1-OPTS-NEXT: FunctionPass Manager
	; GCN-O1-OPTS-NEXT: Cycle Info Analysis			; GCN-O1-OPTS-NEXT: Cycle Info Analysis
	; GCN-O1-OPTS-NEXT: FunctionPass Manager			; GCN-O1-OPTS-NEXT: FunctionPass Manager
	; GCN-O1-OPTS-NEXT: Infer address spaces			; GCN-O1-OPTS-NEXT: Infer address spaces
	; GCN-O1-OPTS-NEXT: Dominator Tree Construction			; GCN-O1-OPTS-NEXT: Dominator Tree Construction
	; GCN-O1-OPTS-NEXT: Cycle Info Analysis			; GCN-O1-OPTS-NEXT: Cycle Info Analysis
	▲ Show 20 Lines • Show All 274 Lines • ▼ Show 20 Lines
	; GCN-O2-NEXT: FunctionPass Manager			; GCN-O2-NEXT: FunctionPass Manager
	; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer			; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer
	; GCN-O2-NEXT: AMDGPU Inline All Functions			; GCN-O2-NEXT: AMDGPU Inline All Functions
	; GCN-O2-NEXT: Inliner for always_inline functions			; GCN-O2-NEXT: Inliner for always_inline functions
	; GCN-O2-NEXT: FunctionPass Manager			; GCN-O2-NEXT: FunctionPass Manager
	; GCN-O2-NEXT: Dominator Tree Construction			; GCN-O2-NEXT: Dominator Tree Construction
	; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)			; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
	; GCN-O2-NEXT: Function Alias Analysis Results			; GCN-O2-NEXT: Function Alias Analysis Results
	; GCN-O2-NEXT: Lower OpenCL enqueued blocks			; GCN-O2-NEXT: Externalize enqueued block runtime handles
	; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions			; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions
	; GCN-O2-NEXT: AMDGPU Attributor			; GCN-O2-NEXT: AMDGPU Attributor
	; GCN-O2-NEXT: FunctionPass Manager			; GCN-O2-NEXT: FunctionPass Manager
	; GCN-O2-NEXT: Cycle Info Analysis			; GCN-O2-NEXT: Cycle Info Analysis
	; GCN-O2-NEXT: FunctionPass Manager			; GCN-O2-NEXT: FunctionPass Manager
	; GCN-O2-NEXT: Infer address spaces			; GCN-O2-NEXT: Infer address spaces
	; GCN-O2-NEXT: Dominator Tree Construction			; GCN-O2-NEXT: Dominator Tree Construction
	; GCN-O2-NEXT: Cycle Info Analysis			; GCN-O2-NEXT: Cycle Info Analysis
	▲ Show 20 Lines • Show All 284 Lines • ▼ Show 20 Lines
	; GCN-O3-NEXT: FunctionPass Manager			; GCN-O3-NEXT: FunctionPass Manager
	; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer			; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer
	; GCN-O3-NEXT: AMDGPU Inline All Functions			; GCN-O3-NEXT: AMDGPU Inline All Functions
	; GCN-O3-NEXT: Inliner for always_inline functions			; GCN-O3-NEXT: Inliner for always_inline functions
	; GCN-O3-NEXT: FunctionPass Manager			; GCN-O3-NEXT: FunctionPass Manager
	; GCN-O3-NEXT: Dominator Tree Construction			; GCN-O3-NEXT: Dominator Tree Construction
	; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)			; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
	; GCN-O3-NEXT: Function Alias Analysis Results			; GCN-O3-NEXT: Function Alias Analysis Results
	; GCN-O3-NEXT: Lower OpenCL enqueued blocks			; GCN-O3-NEXT: Externalize enqueued block runtime handles
	; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions			; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions
	; GCN-O3-NEXT: AMDGPU Attributor			; GCN-O3-NEXT: AMDGPU Attributor
	; GCN-O3-NEXT: FunctionPass Manager			; GCN-O3-NEXT: FunctionPass Manager
	; GCN-O3-NEXT: Cycle Info Analysis			; GCN-O3-NEXT: Cycle Info Analysis
	; GCN-O3-NEXT: FunctionPass Manager			; GCN-O3-NEXT: FunctionPass Manager
	; GCN-O3-NEXT: Infer address spaces			; GCN-O3-NEXT: Infer address spaces
	; GCN-O3-NEXT: Dominator Tree Construction			; GCN-O3-NEXT: Dominator Tree Construction
	; GCN-O3-NEXT: Cycle Info Analysis			; GCN-O3-NEXT: Cycle Info Analysis
	▲ Show 20 Lines • Show All 275 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Move enqueued block handling into clang
AcceptedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 558095

clang/lib/CodeGen/Targets/AMDGPU.cpp

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

llvm/docs/AMDGPUUsage.rst

llvm/lib/IR/AutoUpgrade.cpp

llvm/lib/IR/CMakeLists.txt

llvm/lib/Target/AMDGPU/AMDGPU.h

llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/lib/Target/AMDGPU/CMakeLists.txt

llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll

llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll

llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Move enqueued block handling into clang
AcceptedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 558095

clang/lib/CodeGen/Targets/AMDGPU.cpp

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel-linking.cl

clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl

llvm/docs/AMDGPUUsage.rst

llvm/lib/IR/AutoUpgrade.cpp

llvm/lib/IR/CMakeLists.txt

llvm/lib/Target/AMDGPU/AMDGPU.h

llvm/lib/Target/AMDGPU/AMDGPUExportKernelRuntimeHandles.cpp

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h

llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp

llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/lib/Target/AMDGPU/CMakeLists.txt

llvm/test/CodeGen/AMDGPU/amdgpu-export-kernel-runtime-handles.ll

llvm/test/CodeGen/AMDGPU/enqueue-kernel.ll

llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full.ll

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

AMDGPU: Move enqueued block handling into clang
AcceptedPublic