Index: cfe/trunk/lib/CodeGen/CGBlocks.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBlocks.cpp +++ cfe/trunk/lib/CodeGen/CGBlocks.cpp @@ -738,16 +738,27 @@ } /// Emit a block literal expression in the current function. -llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr) { +llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr, + llvm::Function **InvokeF) { // If the block has no captures, we won't have a pre-computed // layout for it. if (!blockExpr->getBlockDecl()->hasCaptures()) { - if (llvm::Constant *Block = CGM.getAddrOfGlobalBlockIfEmitted(blockExpr)) + // The block literal is emitted as a global variable, and the block invoke + // function has to be extracted from its initializer. + if (llvm::Constant *Block = CGM.getAddrOfGlobalBlockIfEmitted(blockExpr)) { + if (InvokeF) { + auto *GV = cast( + cast(Block)->stripPointerCasts()); + auto *BlockInit = cast(GV->getInitializer()); + *InvokeF = cast( + BlockInit->getAggregateElement(2)->stripPointerCasts()); + } return Block; + } CGBlockInfo blockInfo(blockExpr->getBlockDecl(), CurFn->getName()); computeBlockInfo(CGM, this, blockInfo); blockInfo.BlockExpression = blockExpr; - return EmitBlockLiteral(blockInfo); + return EmitBlockLiteral(blockInfo, InvokeF); } // Find the block info for this block and take ownership of it. @@ -756,10 +767,11 @@ blockExpr->getBlockDecl())); blockInfo->BlockExpression = blockExpr; - return EmitBlockLiteral(*blockInfo); + return EmitBlockLiteral(*blockInfo, InvokeF); } -llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { +llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo, + llvm::Function **InvokeF) { bool IsOpenCL = CGM.getContext().getLangOpts().OpenCL; auto GenVoidPtrTy = IsOpenCL ? 
CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy; @@ -768,9 +780,11 @@ CGM.getTarget().getPointerWidth(GenVoidPtrAddr) / 8); // Using the computed layout, generate the actual block function. bool isLambdaConv = blockInfo.getBlockDecl()->isConversionFromLambda(); - llvm::Constant *blockFn = CodeGenFunction(CGM, true).GenerateBlockFunction( + auto *InvokeFn = CodeGenFunction(CGM, true).GenerateBlockFunction( CurGD, blockInfo, LocalDeclMap, isLambdaConv, blockInfo.CanBeGlobal); - blockFn = llvm::ConstantExpr::getPointerCast(blockFn, GenVoidPtrTy); + if (InvokeF) + *InvokeF = InvokeFn; + auto *blockFn = llvm::ConstantExpr::getPointerCast(InvokeFn, GenVoidPtrTy); // If there is nothing to capture, we can emit this as a global block. if (blockInfo.CanBeGlobal) Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -2779,12 +2779,17 @@ // The most basic form of the call with parameters: // queue_t, kernel_enqueue_flags_t, ndrange_t, block(void) Name = "__enqueue_kernel_basic"; - llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy}; + llvm::Type *ArgTys[] = {QueueTy, Int32Ty, RangeTy, GenericVoidPtrTy, + GenericVoidPtrTy}; llvm::FunctionType *FTy = llvm::FunctionType::get( - Int32Ty, llvm::ArrayRef(ArgTys, 4), false); + Int32Ty, llvm::ArrayRef(ArgTys), false); - llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(3)), GenericVoidPtrTy); + auto Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)); + llvm::Value *Kernel = + Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + llvm::Value *Block = + Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); AttrBuilder B; B.addAttribute(Attribute::ByVal); @@ -2793,7 +2798,7 @@ auto RTCall = Builder.CreateCall(CGM.CreateRuntimeFunction(FTy, Name, ByValAttrSet), - {Queue, Flags, Range, Block}); + {Queue, 
Flags, Range, Kernel, Block}); RTCall->setAttributes(ByValAttrSet); return RValue::get(RTCall); } @@ -2825,21 +2830,23 @@ if (E->getArg(3)->getType()->isBlockPointerType()) { // No events passed, but has variadic arguments. Name = "__enqueue_kernel_vaargs"; - auto *Block = Builder.CreatePointerCast(EmitScalarExpr(E->getArg(3)), - GenericVoidPtrTy); + auto Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)); + llvm::Value *Kernel = + Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + auto *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); auto *PtrToSizeArray = CreateArrayForSizeVar(4); // Create a vector of the arguments, as well as a constant value to // express to the runtime the number of variadic arguments. - std::vector Args = {Queue, - Flags, - Range, - Block, - ConstantInt::get(IntTy, NumArgs - 4), - PtrToSizeArray}; - std::vector ArgTys = {QueueTy, IntTy, - RangeTy, GenericVoidPtrTy, - IntTy, PtrToSizeArray->getType()}; + std::vector Args = { + Queue, Flags, Range, + Kernel, Block, ConstantInt::get(IntTy, NumArgs - 4), + PtrToSizeArray}; + std::vector ArgTys = { + QueueTy, IntTy, RangeTy, + GenericVoidPtrTy, GenericVoidPtrTy, IntTy, + PtrToSizeArray->getType()}; llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); @@ -2863,15 +2870,19 @@ // Convert to generic address space. 
EventList = Builder.CreatePointerCast(EventList, EventPtrTy); ClkEvent = Builder.CreatePointerCast(ClkEvent, EventPtrTy); - llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(6)), GenericVoidPtrTy); + auto Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6)); + llvm::Value *Kernel = + Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + llvm::Value *Block = + Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); std::vector ArgTys = { - QueueTy, Int32Ty, RangeTy, Int32Ty, - EventPtrTy, EventPtrTy, GenericVoidPtrTy}; + QueueTy, Int32Ty, RangeTy, Int32Ty, + EventPtrTy, EventPtrTy, GenericVoidPtrTy, GenericVoidPtrTy}; - std::vector Args = {Queue, Flags, Range, NumEvents, - EventList, ClkEvent, Block}; + std::vector Args = {Queue, Flags, Range, NumEvents, + EventList, ClkEvent, Kernel, Block}; if (NumArgs == 7) { // Has events but no variadics. @@ -2905,24 +2916,30 @@ case Builtin::BIget_kernel_work_group_size: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); - Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); + auto Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); + Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, GenericVoidPtrTy, false), + llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy}, + false), "__get_kernel_work_group_size_impl"), - Arg)); + {Kernel, Arg})); } case Builtin::BIget_kernel_preferred_work_group_size_multiple: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); - Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); + auto 
Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); + Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + Value *Arg = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, GenericVoidPtrTy, false), + llvm::FunctionType::get(IntTy, {GenericVoidPtrTy, GenericVoidPtrTy}, + false), "__get_kernel_preferred_work_group_multiple_impl"), - Arg)); + {Kernel, Arg})); } case Builtin::BIget_kernel_max_sub_group_size_for_ndrange: case Builtin::BIget_kernel_sub_group_count_for_ndrange: { @@ -2930,8 +2947,10 @@ getContext().getTargetAddressSpace(LangAS::opencl_generic)); LValue NDRangeL = EmitAggExprToLValue(E->getArg(0)); llvm::Value *NDRange = NDRangeL.getAddress().getPointer(); - Value *Block = EmitScalarExpr(E->getArg(1)); - Block = Builder.CreatePointerCast(Block, GenericVoidPtrTy); + auto Info = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1)); + Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); + Value *Block = Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); const char *Name = BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange ? 
"__get_kernel_max_sub_group_size_for_ndrange_impl" @@ -2939,9 +2958,10 @@ return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( llvm::FunctionType::get( - IntTy, {NDRange->getType(), GenericVoidPtrTy}, false), + IntTy, {NDRange->getType(), GenericVoidPtrTy, GenericVoidPtrTy}, + false), Name), - {NDRange, Block})); + {NDRange, Kernel, Block})); } case Builtin::BI__builtin_store_half: Index: cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h =================================================================== --- cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h +++ cfe/trunk/lib/CodeGen/CGOpenCLRuntime.h @@ -17,11 +17,13 @@ #define LLVM_CLANG_LIB_CODEGEN_CGOPENCLRUNTIME_H #include "clang/AST/Type.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" namespace clang { +class Expr; class VarDecl; namespace CodeGen { @@ -35,6 +37,14 @@ llvm::Type *PipeTy; llvm::PointerType *SamplerTy; + /// Structure for enqueued block information. + struct EnqueuedBlockInfo { + llvm::Function *Kernel; /// Enqueued block kernel. + llvm::Value *BlockArg; /// The first argument to enqueued block kernel. + }; + /// Maps block expression to block information. + llvm::DenseMap EnqueuedBlockMap; + public: CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr), SamplerTy(nullptr) {} @@ -62,6 +72,10 @@ /// \return __generic void* type. llvm::PointerType *getGenericVoidPointerType(); + + /// \return enqueued block information for enqueued block. 
+ EnqueuedBlockInfo emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E); }; } Index: cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp +++ cfe/trunk/lib/CodeGen/CGOpenCLRuntime.cpp @@ -16,6 +16,7 @@ #include "CGOpenCLRuntime.h" #include "CodeGenFunction.h" #include "TargetInfo.h" +#include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include @@ -110,3 +111,38 @@ CGM.getLLVMContext(), CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic)); } + +CGOpenCLRuntime::EnqueuedBlockInfo +CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E) { + // The block literal may be assigned to a const variable. Look through at + // most one such reference (via its initializer) and one cast to reach it. + if (auto DR = dyn_cast(E)) { + E = cast(DR->getDecl())->getInit(); + } + if (auto Cast = dyn_cast(E)) { + E = Cast->getSubExpr(); + } + auto *Block = cast(E); + + // The same block literal may be enqueued multiple times. Cache it if + // possible. + auto Loc = EnqueuedBlockMap.find(Block); + if (Loc != EnqueuedBlockMap.end()) { + return Loc->second; // reuse the kernel emitted for this block earlier + } + + // Emit block literal as a common block expression and get the block invoke + // function. + llvm::Function *Invoke; // filled in by EmitBlockLiteral via the out-parameter + auto *V = CGF.EmitBlockLiteral(cast(Block), &Invoke); + auto *F = CGF.getTargetHooks().createEnqueuedBlockKernel( + CGF, Invoke, V->stripPointerCasts()); + + // The common part of the post-processing of the kernel goes here. 
+ F->addFnAttr(llvm::Attribute::NoUnwind); + F->setCallingConv( + CGF.getTypes().ClangCallConvToLLVMCallConv(CallingConv::CC_OpenCLKernel)); + EnqueuedBlockInfo Info{F, V}; + EnqueuedBlockMap[Block] = Info; // remember it for later enqueues of Block + return Info; +} Index: cfe/trunk/lib/CodeGen/CodeGenFunction.h =================================================================== --- cfe/trunk/lib/CodeGen/CodeGenFunction.h +++ cfe/trunk/lib/CodeGen/CodeGenFunction.h @@ -1584,7 +1584,14 @@ // Block Bits //===--------------------------------------------------------------------===// - llvm::Value *EmitBlockLiteral(const BlockExpr *); + /// Emit block literal. + /// \return an LLVM value which is a pointer to a struct which contains + /// information about the block, including the block invoke function, the + /// captured variables, etc. + /// \param InvokeF will contain the block invoke function if it is not + /// nullptr. + llvm::Value *EmitBlockLiteral(const BlockExpr *, + llvm::Function **InvokeF = nullptr); static void destroyBlockInfos(CGBlockInfo *info); llvm::Function *GenerateBlockFunction(GlobalDecl GD, @@ -2914,8 +2921,11 @@ LValue EmitOMPSharedLValue(const Expr *E); private: - /// Helpers for blocks - llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info); + /// Helpers for blocks. Returns invoke function by \p InvokeF if it is not + /// nullptr. It should be called without \p InvokeF if the caller does not + /// need invoke function to be returned. + llvm::Value *EmitBlockLiteral(const CGBlockInfo &Info, + llvm::Function **InvokeF = nullptr); /// Helpers for the OpenMP loop directives. 
void EmitOMPSimdInit(const OMPLoopDirective &D, bool IsMonotonic = false); Index: cfe/trunk/lib/CodeGen/CodeGenTypes.h =================================================================== --- cfe/trunk/lib/CodeGen/CodeGenTypes.h +++ cfe/trunk/lib/CodeGen/CodeGenTypes.h @@ -164,8 +164,6 @@ llvm::SmallSet RecordsWithOpaqueMemberPointers; - unsigned ClangCallConvToLLVMCallConv(CallingConv CC); - public: CodeGenTypes(CodeGenModule &cgm); ~CodeGenTypes(); @@ -180,6 +178,9 @@ llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); } const CodeGenOptions &getCodeGenOpts() const; + /// Convert clang calling convention to LLVM calling convention. + unsigned ClangCallConvToLLVMCallConv(CallingConv CC); + /// ConvertType - Convert type T into a llvm::Type. llvm::Type *ConvertType(QualType T); Index: cfe/trunk/lib/CodeGen/TargetInfo.h =================================================================== --- cfe/trunk/lib/CodeGen/TargetInfo.h +++ cfe/trunk/lib/CodeGen/TargetInfo.h @@ -287,6 +287,16 @@ virtual TargetOpenCLBlockHelper *getTargetOpenCLBlockHelper() const { return nullptr; } + + /// Create an OpenCL kernel for an enqueued block. The kernel function is + /// a wrapper for the block invoke function with target-specific calling + /// convention and ABI as an OpenCL kernel. The wrapper function accepts + /// block context and block arguments in a target-specific way and calls + /// the original block invoke function. 
+ virtual llvm::Function * + createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *BlockInvokeFunc, + llvm::Value *BlockLiteral) const; }; } // namespace CodeGen Index: cfe/trunk/lib/CodeGen/TargetInfo.cpp =================================================================== --- cfe/trunk/lib/CodeGen/TargetInfo.cpp +++ cfe/trunk/lib/CodeGen/TargetInfo.cpp @@ -14,6 +14,7 @@ #include "TargetInfo.h" #include "ABIInfo.h" +#include "CGBlocks.h" #include "CGCXXABI.h" #include "CGValue.h" #include "CodeGenFunction.h" @@ -7617,6 +7618,10 @@ const VarDecl *D) const override; llvm::SyncScope::ID getLLVMSyncScopeID(SyncScope S, llvm::LLVMContext &C) const override; + llvm::Function * + createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *BlockInvokeFunc, + llvm::Value *BlockLiteral) const override; }; } @@ -8917,3 +8922,109 @@ return SetCGInfo(new SPIRTargetCodeGenInfo(Types)); } } + +/// Create an OpenCL kernel for an enqueued block. +/// +/// The kernel has the same function type as the block invoke function. Its +/// name is the name of the block invoke function postfixed with "_kernel". +/// It simply calls the block invoke function then returns. 
+llvm::Function * +TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *Invoke, + llvm::Value *BlockLiteral) const { // BlockLiteral is not used here + auto *InvokeFT = Invoke->getFunctionType(); + llvm::SmallVector ArgTys; + for (auto &P : InvokeFT->params()) + ArgTys.push_back(P); // same parameter types as the invoke function + auto &C = CGF.getLLVMContext(); + std::string Name = Invoke->getName().str() + "_kernel"; + auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false); + auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, + &CGF.CGM.getModule()); + auto IP = CGF.Builder.saveIP(); // restored below once the wrapper is emitted + auto *BB = llvm::BasicBlock::Create(C, "entry", F); + auto &Builder = CGF.Builder; + Builder.SetInsertPoint(BB); + llvm::SmallVector Args; + for (auto &A : F->args()) + Args.push_back(&A); // forward every kernel argument unchanged + Builder.CreateCall(Invoke, Args); + Builder.CreateRetVoid(); + Builder.restoreIP(IP); + return F; +} + +/// Create an OpenCL kernel for an enqueued block. +/// +/// The type of the first argument (the block literal) is the struct type +/// of the block literal instead of a pointer type. The first argument +/// (block literal) is passed directly by value to the kernel. The kernel +/// allocates the same type of struct on stack and stores the block literal +/// to it and passes its pointer to the block invoke function. The kernel +/// has "enqueued-block" function attribute and kernel argument metadata. 
+llvm::Function *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel( + CodeGenFunction &CGF, llvm::Function *Invoke, + llvm::Value *BlockLiteral) const { + auto &Builder = CGF.Builder; + auto &C = CGF.getLLVMContext(); + + auto *BlockTy = BlockLiteral->getType()->getPointerElementType(); + auto *InvokeFT = Invoke->getFunctionType(); + llvm::SmallVector ArgTys; + llvm::SmallVector AddressQuals; + llvm::SmallVector AccessQuals; + llvm::SmallVector ArgTypeNames; + llvm::SmallVector ArgBaseTypeNames; + llvm::SmallVector ArgTypeQuals; + llvm::SmallVector ArgNames; + + ArgTys.push_back(BlockTy); // kernel argument 0: the block literal, by value + ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal")); + AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0))); + ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal")); + ArgTypeQuals.push_back(llvm::MDString::get(C, "")); + AccessQuals.push_back(llvm::MDString::get(C, "none")); + ArgNames.push_back(llvm::MDString::get(C, "block_literal")); + for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) { + // The block literal is already argument 0; forward only parameters 1..E-1. 
+ ArgTys.push_back(InvokeFT->getParamType(I)); + ArgTypeNames.push_back(llvm::MDString::get(C, "void*")); + AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3))); + AccessQuals.push_back(llvm::MDString::get(C, "none")); + ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*")); + ArgTypeQuals.push_back(llvm::MDString::get(C, "")); + ArgNames.push_back( + llvm::MDString::get(C, std::string("local_arg") + std::to_string(I))); + } + std::string Name = Invoke->getName().str() + "_kernel"; + auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false); + auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, + &CGF.CGM.getModule()); + F->addFnAttr("enqueued-block"); + auto IP = CGF.Builder.saveIP(); + auto *BB = llvm::BasicBlock::Create(C, "entry", F); + Builder.SetInsertPoint(BB); + unsigned BlockAlign = 
CGF.CGM.getDataLayout().getPrefTypeAlignment(BlockTy); + auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr); // stack copy + BlockPtr->setAlignment(BlockAlign); + Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign); + auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0)); + llvm::SmallVector Args; + Args.push_back(Cast); // the invoke function gets a pointer to the stack copy + for (auto I = F->arg_begin() + 1, E = F->arg_end(); I != E; ++I) + Args.push_back(I); + Builder.CreateCall(Invoke, Args); + Builder.CreateRetVoid(); + Builder.restoreIP(IP); + + F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals)); + F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals)); + F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames)); + F->setMetadata("kernel_arg_base_type", + llvm::MDNode::get(C, ArgBaseTypeNames)); + F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals)); + if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) + F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames)); + + return F; +} Index: cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl =================================================================== --- cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ cfe/trunk/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -0,0 +1,36 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK + +typedef struct {int a;} ndrange_t; + +// CHECK-LABEL: define amdgpu_kernel void @test +kernel void test(global char *a, char b, global long *c, long d) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + }); + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + c[0] = d; + }); +} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>) +// CHECK-SAME: 
#[[ATTR:[0-9]+]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} +// CHECK: entry: +// CHECK: %1 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 +// CHECK: store <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %0, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1, align 8 +// CHECK: %2 = addrspacecast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %1 to i8 addrspace(4)* +// CHECK: call void @__test_block_invoke(i8 addrspace(4)* %2) +// CHECK: ret void +// CHECK:} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>) +// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// CHECK: attributes #[[ATTR]] = { nounwind "enqueued-block" } Index: cfe/trunk/test/CodeGenOpenCL/blocks.cl =================================================================== --- cfe/trunk/test/CodeGenOpenCL/blocks.cl +++ cfe/trunk/test/CodeGenOpenCL/blocks.cl @@ -50,3 +50,5 @@ // COMMON: %[[block:.*]] = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 }> addrspace(4)* // COMMON: %[[block_capture_addr:.*]] = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }> addrspace(4)* %[[block]], i32 0, i32 3 // COMMON: %[[block_capture:.*]] = load i32, i32 addrspace(4)* %[[block_capture_addr]] + +// COMMON-NOT: define{{.*}}@__foo_block_invoke_kernel Index: cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl =================================================================== --- cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ cfe/trunk/test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,30 @@ typedef void (^bl_t)(local void *); 
typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. 
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: 
[[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG11:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* {{@[^ ]+}} to i8*) to i8 addrspace(4)*) } + +// Emits block literal [[BL_GLOBAL]], invoke function [[INV_G]] and global block variable @block_G +// COMMON: define internal spir_func void [[INV_G]](i8 addrspace(4)* %{{.*}}, i8 addrspace(3)* %{{.*}}) const bl_t block_G = (bl_t) ^ (local void *a) {}; +// COMMON-LABEL: define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %{{.*}}, i32 addrspace(1)* %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: %default_queue = alloca %opencl.queue_t* queue_t default_queue; @@ -24,62 +44,84 @@ // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; + // Emits block literal on stack and block kernel [[INVLK1]]. 
// COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic( + // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK1:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* [[BL_I8]]) enqueue_kernel(default_queue, flags, ndrange, ^(void) { a[i] = b[i]; }); + // Emits block literal on stack and block kernel [[INVLK2]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)* // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)* + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL2:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic_events + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVLK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* [[BL_I8]]) + enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, ^(void) { a[i] = b[i]; }); + // Emits global block literal [[BLG1]] and block kernel [[INVGK1]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_vaargs( + // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK1:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 256); char c; + // Emits global block literal [[BLG2]] and block kernel [[INVGK2]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_vaargs( + // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK2:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, c); + // Emits global block literal [[BLG3]] and block kernel [[INVGK3]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -88,17 +130,22 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_events_vaargs + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK3:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 
addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; }, 256); + // Emits global block literal [[BLG4]] and block kernel [[INVGK4]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -107,11 +154,15 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_events_vaargs + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], 
%struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK4:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; @@ -119,22 +170,28 @@ c); long l; + // Emits global block literal [[BLG5]] and block kernel [[INVGK5]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_vaargs + // COMMON-SAME: (%opencl.queue_t{{.*}}* 
[[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK5:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG5]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, l); + // Emits global block literal [[BLG6]] and block kernel [[INVGK6]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [3 x i32] @@ -144,7 +201,6 @@ // B32: store i32 2, i32* %[[TMP2]], align 4 // B32: %[[TMP3:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 2 // B32: store i32 4, i32* %[[TMP3]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [3 x i64] // B64: %[[TMP1:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 1, i64* %[[TMP1]], align 8 @@ -152,52 +208,132 @@ // B64: store i64 2, i64* %[[TMP2]], align 8 // B64: %[[TMP3:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 2 // B64: store i64 4, i64* %[[TMP3]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* 
%[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_vaargs + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK6:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG6]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p1, local void *p2, local void *p3) { return; }, 1, 2, 4); + // Emits global block literal [[BLG7]] and block kernel [[INVGK7]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 0, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 4294967296, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_vaargs + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], 
i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK7:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG7]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, + // B32-SAME: i32* %[[TMP1]]) + // B64-SAME: i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 4294967296L); + // Emits global block literal [[BLG8]] and invoke function [[INVG8]]. // The full type of these expressions are long (and repeated elsewhere), so we // capture it as part of the regex for convenience and clarity. - // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A:@__block_literal_global(\.[0-9]+)?]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A + // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A void (^const block_A)(void) = ^{ return; }; - // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B + // Emits global block literal [[BLG9]] and invoke function [[INVG9]]. 
+ // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B void (^const block_B)(local void *) = ^(local void *a) { return; }; - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. + // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG8]] and block kernel [[INVGK8]]. [[INVGK8]] calls [[INVG8]]. 
+ // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue + // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags + // COMMON-LABEL: call i32 @__enqueue_kernel_basic( + // COMMON-SAME: %opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK8:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + enqueue_kernel(default_queue, flags, ndrange, block_A); + + // Uses block kernel [[INVGK8]] and global block literal [[BLG8]]. + // COMMON: call i32 @__get_kernel_work_group_size_impl( + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK8]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) unsigned size = get_kernel_work_group_size(block_A); - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted. 
+ // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG9]] and block kernel [[INVGK9]]. [[INVGK9]] calls [[INVG9]]. + // COMMON: call i32 @__get_kernel_work_group_size_impl( + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK9:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_work_group_size(block_B); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8]] and block kernel [[INVGK8]]. Make sure no redundant block literal and invoke functions are emitted. 
+ // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl( + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK8]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_A); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BL_GLOBAL]] and block kernel [[INV_G_K]]. [[INV_G_K]] calls [[INV_G]]. + // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl( + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INV_G_K:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_G); - // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Emits global block literal [[BLG10]] and block kernel [[INVGK10]]. 
+ // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{[^,]+}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK10:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG10]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){}); - // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Emits global block literal [[BLG11]] and block kernel [[INVGK11]]. + // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{[^,]+}}, + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8* bitcast ({{.*}} [[INVGK11:[^ ]+_kernel]] to i8*) to i8 addrspace(4)*), + // COMMON-SAME: i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG11]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_sub_group_count_for_ndrange(ndrange, ^(){}); } + +// COMMON: define internal spir_kernel void [[INVLK1]](i8 addrspace(4)*) #{{[0-9]+}} { +// COMMON: entry: +// COMMON: call void @__device_side_enqueue_block_invoke(i8 addrspace(4)* %0) +// COMMON: ret void +// COMMON: } +// COMMON: define internal spir_kernel void [[INVLK2]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK1]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK2]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK3]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK4]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal 
spir_kernel void [[INVGK5]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK6]](i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*) #{{[0-9]+}} { +// COMMON: entry: +// COMMON: call void @__device_side_enqueue_block_invoke_8(i8 addrspace(4)* %0, i8 addrspace(3)* %1, i8 addrspace(3)* %2, i8 addrspace(3)* %3) +// COMMON: ret void +// COMMON: } +// COMMON: define internal spir_kernel void [[INVGK7]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK8]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK10]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVGK11]](i8 addrspace(4)*{{.*}})