Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -2609,7 +2609,8 @@ Int32Ty, llvm::ArrayRef(ArgTys, 4), false); llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(3)), GenericVoidPtrTy); + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)), + GenericVoidPtrTy); AttrBuilder B; B.addAttribute(Attribute::ByVal); @@ -2650,8 +2651,9 @@ if (E->getArg(3)->getType()->isBlockPointerType()) { // No events passed, but has variadic arguments. Name = "__enqueue_kernel_vaargs"; - auto *Block = Builder.CreatePointerCast(EmitScalarExpr(E->getArg(3)), - GenericVoidPtrTy); + auto *Block = Builder.CreatePointerCast( + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)), + GenericVoidPtrTy); auto *PtrToSizeArray = CreateArrayForSizeVar(4); // Create a vector of the arguments, as well as a constant value to @@ -2689,7 +2691,8 @@ EventList = Builder.CreatePointerCast(EventList, EventPtrTy); ClkEvent = Builder.CreatePointerCast(ClkEvent, EventPtrTy); llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(6)), GenericVoidPtrTy); + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6)), + GenericVoidPtrTy); std::vector ArgTys = { QueueTy, Int32Ty, RangeTy, Int32Ty, @@ -2730,7 +2733,8 @@ case Builtin::BIget_kernel_work_group_size: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); + Value *Arg = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( @@ -2741,7 +2745,8 @@ case Builtin::BIget_kernel_preferred_work_group_size_multiple: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); + Value *Arg = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( @@ -2755,7 +2760,8 @@ getContext().getTargetAddressSpace(LangAS::opencl_generic)); LValue NDRangeL = EmitAggExprToLValue(E->getArg(0)); llvm::Value *NDRange = NDRangeL.getAddress().getPointer(); - Value *Block = EmitScalarExpr(E->getArg(1)); + Value *Block = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1)); Block = Builder.CreatePointerCast(Block, GenericVoidPtrTy); const char *Name = BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange Index: lib/CodeGen/CGOpenCLRuntime.h =================================================================== --- lib/CodeGen/CGOpenCLRuntime.h +++ lib/CodeGen/CGOpenCLRuntime.h @@ -17,11 +17,13 @@ #define LLVM_CLANG_LIB_CODEGEN_CGOPENCLRUNTIME_H #include "clang/AST/Type.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" namespace clang { +class Expr; class VarDecl; namespace CodeGen { @@ -34,6 +36,8 @@ CodeGenModule &CGM; llvm::Type *PipeTy; llvm::PointerType *SamplerTy; + /// Maps block expression to llvm value. + llvm::DenseMap EnqueuedBlockMap; public: CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr), @@ -62,6 +66,9 @@ /// \return __generic void* type. llvm::PointerType *getGenericVoidPointerType(); + + /// \return block literal for enqueued block. + llvm::Value *emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E); }; } Index: lib/CodeGen/CGOpenCLRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenCLRuntime.cpp +++ lib/CodeGen/CGOpenCLRuntime.cpp @@ -16,6 +16,7 @@ #include "CGOpenCLRuntime.h" #include "CodeGenFunction.h" #include "TargetInfo.h" +#include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalValue.h" #include @@ -110,3 +111,95 @@ CGM.getLLVMContext(), CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic)); } + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { + // The block literal may be assigned to a const variable. Chasing down + // to get the block literal. + if (auto DR = dyn_cast(E)) { + E = cast(DR->getDecl())->getInit(); + } + if (auto Cast = dyn_cast(E)) { + E = Cast->getSubExpr(); + } + auto *Block = cast(E); + + // The same block literal may be enqueued multiple times. Cache it if + // possible. + bool Cacheable = !Block->getBlockDecl()->hasCaptures(); + if (Cacheable) { + auto Loc = EnqueuedBlockMap.find(Block); + if (Loc != EnqueuedBlockMap.end()) { + return Loc->second; + } + } + + // Emit block literal as a common block expression. There are two situations. + // If there is no capture, the block literal is emitted as a global variable, + // otherwise it is emitted as an alloca. In both case, it is emitted as a + // structure and the block invoke function is the third field. + auto *V = CGF.EmitScalarExpr(Block); + llvm::Function *F; + if (auto *I = dyn_cast(V)) { + // If the block literal is emitted as an instruction, it is an alloca + // and the block invoke function is stored to GEP of this alloca. + // Find the GEP and store instruction and replace the invoke function + // with a kernel. + auto *Block = I->stripPointerCasts(); + llvm::Function *Invoke = nullptr; + llvm::StoreInst *ST = nullptr; + llvm::GetElementPtrInst *GEP = nullptr; + for (; I; I = I->getPrevNode()) { + if ((ST = dyn_cast(I))) { + if ((GEP = + dyn_cast(ST->getPointerOperand()))) { + if (GEP->getPointerOperand() == Block && + cast(GEP->getOperand(2))->getZExtValue() == + 2) { + Invoke = cast( + ST->getValueOperand()->stripPointerCasts()); + break; + } + } + } + } + assert(ST && GEP && Invoke && "Invalid block invoke function"); + F = CGF.getTargetHooks().createEnqueuedBlockKernel(CGF, Invoke, Block); + ST->replaceUsesOfWith( + ST->getValueOperand(), + llvm::ConstantExpr::getPointerCast( + F, ST->getPointerOperand()->getType()->getPointerElementType())); + } else { + // If the block literal is emitted as a global variable, the block invoke + // function is in its initializer. Create a new global variable and replace + // the invoke function with a kernel. + auto *Block = cast( + cast(V)->stripPointerCasts()); + auto *BlockInit = cast(Block->getInitializer()); + auto *Invoke = cast( + BlockInit->getAggregateElement(2)->stripPointerCasts()); + F = CGF.getTargetHooks().createEnqueuedBlockKernel(CGF, Invoke, Block); + ConstantInitBuilder Builder(CGM); + auto Fields = Builder.beginStruct(BlockInit->getType()); + Fields.add(BlockInit->getAggregateElement(0U)); + Fields.add(BlockInit->getAggregateElement(1)); + Fields.add(llvm::ConstantExpr::getPointerCast( + F, BlockInit->getAggregateElement(2)->getType())); + Block = Fields.finishAndCreateGlobal( + "__opencl_enqueued_block_literal_global", + CharUnits::fromQuantity( + Block->getPointerAlignment(CGM.getDataLayout())), + /*Constant=*/true, llvm::GlobalValue::InternalLinkage, + Block->getType()->getPointerAddressSpace()); + V = llvm::ConstantExpr::getPointerCast(Block, V->getType()); + } + + // The common part of the post-processing of the kernel goes here. + F->addFnAttr(llvm::Attribute::NoUnwind); + F->setCallingConv( + CGF.getTypes().ClangCallConvToLLVMCallConv(CallingConv::CC_OpenCLKernel)); + if (Cacheable) { + EnqueuedBlockMap[Block] = V; + } + return V; +} Index: lib/CodeGen/CodeGenTypes.h =================================================================== --- lib/CodeGen/CodeGenTypes.h +++ lib/CodeGen/CodeGenTypes.h @@ -164,8 +164,6 @@ llvm::SmallSet RecordsWithOpaqueMemberPointers; - unsigned ClangCallConvToLLVMCallConv(CallingConv CC); - public: CodeGenTypes(CodeGenModule &cgm); ~CodeGenTypes(); @@ -180,6 +178,9 @@ llvm::LLVMContext &getLLVMContext() { return TheModule.getContext(); } const CodeGenOptions &getCodeGenOpts() const; + /// Convert clang calling convention to LLVM callilng convention. + unsigned ClangCallConvToLLVMCallConv(CallingConv CC); + /// ConvertType - Convert type T into a llvm::Type. llvm::Type *ConvertType(QualType T); Index: lib/CodeGen/TargetInfo.h =================================================================== --- lib/CodeGen/TargetInfo.h +++ lib/CodeGen/TargetInfo.h @@ -32,6 +32,7 @@ namespace clang { class Decl; +class ASTContext; namespace CodeGen { class ABIInfo; @@ -287,6 +288,11 @@ virtual TargetOpenCLBlockHelper *getTargetOpenCLBlockHelper() const { return nullptr; } + /// Create an OpenCL kernel for an enqueued block. + virtual llvm::Function * + createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *BlockInvokeFunc, + llvm::Value *BlockLiteral) const; }; } // namespace CodeGen Index: lib/CodeGen/TargetInfo.cpp =================================================================== --- lib/CodeGen/TargetInfo.cpp +++ lib/CodeGen/TargetInfo.cpp @@ -14,6 +14,7 @@ #include "TargetInfo.h" #include "ABIInfo.h" +#include "CGBlocks.h" #include "CGCXXABI.h" #include "CGValue.h" #include "CodeGenFunction.h" @@ -7617,6 +7618,10 @@ const VarDecl *D) const override; llvm::SyncScope::ID getLLVMSyncScopeID(SyncScope S, llvm::LLVMContext &C) const override; + llvm::Function * + createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *BlockInvokeFunc, + llvm::Value *BlockLiteral) const override; }; } @@ -8917,3 +8922,98 @@ return SetCGInfo(new SPIRTargetCodeGenInfo(Types)); } } + +llvm::Function * +TargetCodeGenInfo::createEnqueuedBlockKernel(CodeGenFunction &CGF, + llvm::Function *Invoke, + llvm::Value *BlockLiteral) const { + auto *InvokeFT = Invoke->getFunctionType(); + llvm::SmallVector ArgTys; + for (auto &P : InvokeFT->params()) + ArgTys.push_back(P); + auto &C = CGF.getLLVMContext(); + std::string Name = Invoke->getName().str() + "_kernel"; + auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false); + auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, + &CGF.CGM.getModule()); + auto IP = CGF.Builder.saveIP(); + auto *BB = llvm::BasicBlock::Create(C, "entry", F); + auto &Builder = CGF.Builder; + Builder.SetInsertPoint(BB); + llvm::SmallVector Args; + for (auto &A : F->args()) + Args.push_back(&A); + Builder.CreateCall(Invoke, Args); + Builder.CreateRetVoid(); + Builder.restoreIP(IP); + return F; +} + +llvm::Function *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel( + CodeGenFunction &CGF, llvm::Function *Invoke, + llvm::Value *BlockLiteral) const { + auto &Builder = CGF.Builder; + auto &Context = CGF.getLLVMContext(); + + auto *BlockTy = BlockLiteral->getType()->getPointerElementType(); + auto *InvokeFT = Invoke->getFunctionType(); + llvm::SmallVector ArgTys; + llvm::SmallVector AddressQuals; + llvm::SmallVector AccessQuals; + llvm::SmallVector ArgTypeNames; + llvm::SmallVector ArgBaseTypeNames; + llvm::SmallVector ArgTypeQuals; + llvm::SmallVector ArgNames; + + ArgTys.push_back(BlockTy); + ArgTypeNames.push_back(llvm::MDString::get(Context, "__block_literal")); + AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0))); + ArgBaseTypeNames.push_back(llvm::MDString::get(Context, "__block_literal")); + ArgTypeQuals.push_back(llvm::MDString::get(Context, "")); + AccessQuals.push_back(llvm::MDString::get(Context, "none")); + ArgNames.push_back(llvm::MDString::get(Context, "block_literal")); + for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) { + ArgTys.push_back(InvokeFT->getParamType(I)); + ArgTys.push_back(BlockTy); + ArgTypeNames.push_back(llvm::MDString::get(Context, "void*")); + AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3))); + AccessQuals.push_back(llvm::MDString::get(Context, "none")); + ArgBaseTypeNames.push_back(llvm::MDString::get(Context, "void*")); + ArgTypeQuals.push_back(llvm::MDString::get(Context, "")); + ArgNames.push_back(llvm::MDString::get(Context, std::string("local_arg") + + std::to_string(I))); + } + auto &C = CGF.getLLVMContext(); + std::string Name = Invoke->getName().str() + "_kernel"; + auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false); + auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, + &CGF.CGM.getModule()); + F->addFnAttr("enqueued_block"); + auto IP = CGF.Builder.saveIP(); + auto *BB = llvm::BasicBlock::Create(C, "entry", F); + Builder.SetInsertPoint(BB); + auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr); + Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, 4); + auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0)); + llvm::SmallVector Args; + Args.push_back(Cast); + for (auto I = F->arg_begin() + 1, E = F->arg_end(); I != E; ++I) + Args.push_back(I); + Builder.CreateCall(Invoke, Args); + Builder.CreateRetVoid(); + Builder.restoreIP(IP); + + F->setMetadata("kernel_arg_addr_space", + llvm::MDNode::get(Context, AddressQuals)); + F->setMetadata("kernel_arg_access_qual", + llvm::MDNode::get(Context, AccessQuals)); + F->setMetadata("kernel_arg_type", llvm::MDNode::get(Context, ArgTypeNames)); + F->setMetadata("kernel_arg_base_type", + llvm::MDNode::get(Context, ArgBaseTypeNames)); + F->setMetadata("kernel_arg_type_qual", + llvm::MDNode::get(Context, ArgTypeQuals)); + if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) + F->setMetadata("kernel_arg_name", llvm::MDNode::get(Context, ArgNames)); + + return F; +} Index: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl =================================================================== --- /dev/null +++ test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -0,0 +1,28 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK + +typedef struct {int a;} ndrange_t; + +// CHECK-LABEL: define amdgpu_kernel void @test +kernel void test(global char *a, char b, global long *c, long d) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + }); + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + c[0] = d; + }); +} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>) +// CHECK-SAME: #[[ATTR:[0-9]+]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>) +// CHECK-SAME: #[[ATTR]] !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// CHECK: attributes #[[ATTR]] = { nounwind "enqueued_block" } Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl =================================================================== --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,33 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. +// COMMON: [[BLG1:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BL_GLOBAL_K:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G_K:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG10:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG10:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG11:@__opencl_enqueued_block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG11:@[^ ]+_kernel]] to i8*) to i8 addrspace(4)*) } + +// Emits block literal [[BL_GLOBAL]], invoke function [[INV_G]] and global block variable @block_G +// COMMON: define internal spir_func void [[INV_G]](i8 addrspace(4)* %{{.*}}, i8 addrspace(3)* %{{.*}}) const bl_t block_G = (bl_t) ^ (local void *a) {}; +// COMMON-LABEL: define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %{{.*}}, i32 addrspace(1)* %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: %default_queue = alloca %opencl.queue_t* queue_t default_queue; @@ -24,62 +47,75 @@ // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; + // Emits block literal on stack and invoke function [[INVL1]]. // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]]) enqueue_kernel(default_queue, flags, ndrange, ^(void) { a[i] = b[i]; }); + // Emits block literal on stack and invoke function [[INVL2]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)* // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)* + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVL2:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic_events + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]]) enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, ^(void) { a[i] = b[i]; }); + // Emits global block literal [[BLG1]] and invoke function [[INVG1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 256); char c; + // Emits global block literal [[BLG2]] and invoke function [[INVG2]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, c); + // Emits global block literal [[BLG3]] and invoke function [[INVG3]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -88,17 +124,20 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; }, 256); + // Emits global block literal [[BLG4]] and invoke function [[INVG4]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -107,11 +146,13 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; @@ -119,22 +160,26 @@ c); long l; + // Emits global block literal [[BLG5]] and invoke function [[INVG5]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, l); + // Emits global block literal [[BLG6]] and invoke function [[INVG6]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [3 x i32] @@ -144,7 +189,8 @@ // B32: store i32 2, i32* %[[TMP2]], align 4 // B32: %[[TMP3:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 2 // B32: store i32 4, i32* %[[TMP3]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [3 x i64] // B64: %[[TMP1:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 1, i64* %[[TMP1]], align 8 @@ -152,52 +198,104 @@ // B64: store i64 2, i64* %[[TMP2]], align 8 // B64: %[[TMP3:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 2 // B64: store i64 4, i64* %[[TMP3]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p1, local void *p2, local void *p3) { return; }, 1, 2, 4); + // Emits global block literal [[BLG7]] and invoke function [[INVG7]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 0, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 4294967296, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__opencl_enqueued_block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 4294967296L); + // Emits global block literal [[BLG8]] and invoke function [[INVG8]]. // The full type of these expressions are long (and repeated elsewhere), so we // capture it as part of the regex for convenience and clarity. - // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A:@__block_literal_global(\.[0-9]+)?]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A + // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A void (^const block_A)(void) = ^{ return; }; - // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B + // Emits global block literal [[BLG9]] and invoke function [[INVG9]]. + // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B void (^const block_B)(local void *) = ^(local void *a) { return; }; - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. + // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG8K]] and invoke function [[INVG8K]]. [[INVG8K]] is the same as [[INV8]] except calling convention, ABI and metadata. + // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue + // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags + // COMMON-LABEL: call i32 @__enqueue_kernel_basic + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + enqueue_kernel(default_queue, flags, ndrange, block_A); + + // Uses global block literal [[BLG8K]]. + // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) unsigned size = get_kernel_work_group_size(block_A); - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted. + // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG9K]] and invoke function [[INVG9K]]. [[INVG9K]] is the same as [[INV9]] except calling convention, ABI and metadata. + // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_work_group_size(block_B); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8K]] and invoke function [[INVG8K]]. Make sure no redundant block literal ind invoke functions are emitted. + // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_A); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BL_GLOBAL_K]] and invoke function [[INV_G_K]]. [[INV_G_K]] is the same as [[INV_G]] except calling convention, ABI and metadata. + // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL_K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_G); - // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Emits global block literal [[BLG10]] and invoke function [[INVG10]]. + // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG10]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){}); - // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Emits global block literal [[BLG11]] and invoke function [[INVG11]]. + // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG11]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_sub_group_count_for_ndrange(ndrange, ^(){}); } + +// COMMON: define internal spir_kernel void [[INVL1]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVL2]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG1]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG2]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG3]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG4]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG5]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG6]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG7]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)* %{{.*}}) +// COMMON: define internal spir_kernel void [[INVG8K]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG9K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(4)*{{.*}}, i8 addrspace(3)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG10]](i8 addrspace(4)*{{.*}}) +// COMMON: define internal spir_kernel void [[INVG11]](i8 addrspace(4)*{{.*}})