Index: lib/CodeGen/CGBlocks.h =================================================================== --- lib/CodeGen/CGBlocks.h +++ lib/CodeGen/CGBlocks.h @@ -258,6 +258,9 @@ /// has been encountered. CGBlockInfo *NextBlockInfo; + /// The block is emitted as an OpenCL device-side kernel. + bool AsOpenCLKernel; + const Capture &getCapture(const VarDecl *var) const { return const_cast<CGBlockInfo*>(this)->getCapture(var); } @@ -275,6 +278,9 @@ return BlockExpression; } + void setAsOpenCLKernel(bool Yes) { AsOpenCLKernel = Yes; } + bool asOpenCLKernel() const { return AsOpenCLKernel; } + CGBlockInfo(const BlockDecl *blockDecl, StringRef Name); }; Index: lib/CodeGen/CGBlocks.cpp =================================================================== --- lib/CodeGen/CGBlocks.cpp +++ lib/CodeGen/CGBlocks.cpp @@ -18,8 +18,9 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" -#include "clang/CodeGen/ConstantInitBuilder.h" +#include "TargetInfo.h" #include "clang/AST/DeclObjC.h" +#include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/ADT/SmallSet.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/DataLayout.h" @@ -31,10 +32,10 @@ using namespace CodeGen; CGBlockInfo::CGBlockInfo(const BlockDecl *block, StringRef name) - : Name(name), CXXThisIndex(0), CanBeGlobal(false), NeedsCopyDispose(false), - HasCXXObject(false), UsesStret(false), HasCapturedVariableLayout(false), - LocalAddress(Address::invalid()), StructureType(nullptr), Block(block), - DominatingIP(nullptr) { + : Name(name), CXXThisIndex(0), CanBeGlobal(false), NeedsCopyDispose(false), + HasCXXObject(false), UsesStret(false), HasCapturedVariableLayout(false), + LocalAddress(Address::invalid()), StructureType(nullptr), Block(block), + DominatingIP(nullptr), AsOpenCLKernel(false) { // Skip asm prefix, if any. 'name' is usually taken directly from // the mangled name of the enclosing function. @@ -715,15 +716,18 @@ } /// Emit a block literal expression in the current function. -llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr) { +llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr, + bool AsOpenCLKernel) { // If the block has no captures, we won't have a pre-computed // layout for it. if (!blockExpr->getBlockDecl()->hasCaptures()) { - if (llvm::Constant *Block = CGM.getAddrOfGlobalBlockIfEmitted(blockExpr)) - return Block; + if (!AsOpenCLKernel) + if (llvm::Constant *Block = CGM.getAddrOfGlobalBlockIfEmitted(blockExpr)) + return Block; CGBlockInfo blockInfo(blockExpr->getBlockDecl(), CurFn->getName()); computeBlockInfo(CGM, this, blockInfo); blockInfo.BlockExpression = blockExpr; + blockInfo.setAsOpenCLKernel(AsOpenCLKernel); return EmitBlockLiteral(blockInfo); } @@ -733,6 +737,7 @@ blockExpr->getBlockDecl())); blockInfo->BlockExpression = blockExpr; + blockInfo->setAsOpenCLKernel(AsOpenCLKernel); return EmitBlockLiteral(*blockInfo); } @@ -745,10 +750,8 @@ CGM.getTarget().getPointerWidth(GenVoidPtrAddr) / 8); // Using the computed layout, generate the actual block function. bool isLambdaConv = blockInfo.getBlockDecl()->isConversionFromLambda(); - llvm::Constant *blockFn - = CodeGenFunction(CGM, true).GenerateBlockFunction(CurGD, blockInfo, - LocalDeclMap, - isLambdaConv); + llvm::Constant *blockFn = CodeGenFunction(CGM, true).GenerateBlockFunction( + CurGD, blockInfo, LocalDeclMap, isLambdaConv, blockInfo.AsOpenCLKernel); blockFn = llvm::ConstantExpr::getPointerCast(blockFn, GenVoidPtrTy); // If there is nothing to capture, we can emit this as a global block.
@@ -1164,10 +1167,8 @@ llvm::Constant *blockFn; { CodeGenFunction::DeclMapTy LocalDeclMap; - blockFn = CodeGenFunction(*this).GenerateBlockFunction(GlobalDecl(), - blockInfo, - LocalDeclMap, - false); + blockFn = CodeGenFunction(*this).GenerateBlockFunction( + GlobalDecl(), blockInfo, LocalDeclMap, false, false); } auto GenVoidPtrTy = getContext().getLangOpts().OpenCL ? getOpenCLRuntime().getGenericVoidPointerType() @@ -1185,7 +1186,8 @@ // Callers should detect this case on their own: calling this function // generally requires computing layout information, which is a waste of time // if we've already emitted this block. - assert(!CGM.getAddrOfGlobalBlockIfEmitted(blockInfo.BlockExpression) && + assert((blockInfo.asOpenCLKernel() || + !CGM.getAddrOfGlobalBlockIfEmitted(blockInfo.BlockExpression)) && "Refusing to re-emit a global block."); // Generate the constants for the block literal initializer. @@ -1232,19 +1234,29 @@ CGM.getTypes().ConvertType(blockInfo.getBlockExpr()->getType()); llvm::Constant *Result = llvm::ConstantExpr::getPointerCast(literal, RequiredType); - CGM.setAddrOfGlobalBlock(blockInfo.BlockExpression, Result); + if (!blockInfo.asOpenCLKernel()) + CGM.setAddrOfGlobalBlock(blockInfo.BlockExpression, Result); return Result; } void CodeGenFunction::setBlockContextParameter(const ImplicitParamDecl *D, unsigned argNum, - llvm::Value *arg) { + const ParamValue &PV) { assert(BlockInfo && "not emitting prologue of block invocation function?!"); + auto &C = getContext(); + auto *arg = PV.getAnyValue(); + // On certain targets a struct-type OpenCL kernel argument is passed directly, + // therefore the block literal argument is not a pointer. + bool IsPtr = isa<llvm::PointerType>(arg->getType()); llvm::Value *localAddr = nullptr; - if (CGM.getCodeGenOpts().OptimizationLevel == 0) { + if (CGM.getCodeGenOpts().OptimizationLevel == 0 || !IsPtr) { // Allocate a stack slot to let the debug info survive the RA. - Address alloc = CreateMemTemp(D->getType(), D->getName() + ".addr"); + Address alloc = CreateMemTemp( + !PV.isIndirect() ? D->getType() + : C.getPointerType(C.getAddrSpaceQualType( + D->getType(), getASTAllocaAddressSpace())), + D->getName() + ".addr"); Builder.CreateStore(arg, alloc); localAddr = Builder.CreateLoad(alloc); } @@ -1261,15 +1273,14 @@ SourceLocation StartLoc = BlockInfo->getBlockExpr()->getBody()->getLocStart(); ApplyDebugLocation Scope(*this, StartLoc); + assert(BlockInfo->asOpenCLKernel() || IsPtr); // Instead of messing around with LocalDeclMap, just set the value // directly as BlockPointer. - BlockPointer = Builder.CreatePointerCast( - arg, - BlockInfo->StructureType->getPointerTo( - getContext().getLangOpts().OpenCL - ? getContext().getTargetAddressSpace(LangAS::opencl_generic) - : 0), - "block"); + BlockPointer = + Builder.CreatePointerCast(IsPtr ? arg : localAddr, + BlockInfo->StructureType->getPointerTo( + CGM.getDataLayout().getAllocaAddrSpace()), + "block"); } Address CodeGenFunction::LoadBlockStruct() { @@ -1278,11 +1289,9 @@ return Address(BlockPointer, BlockInfo->BlockAlign); } -llvm::Function * -CodeGenFunction::GenerateBlockFunction(GlobalDecl GD, - const CGBlockInfo &blockInfo, - const DeclMapTy &ldm, - bool IsLambdaConversionToBlock) { +llvm::Function *CodeGenFunction::GenerateBlockFunction( + GlobalDecl GD, const CGBlockInfo &blockInfo, const DeclMapTy &ldm, + bool IsLambdaConversionToBlock, bool AsOpenCLKernel) { const BlockDecl *blockDecl = blockInfo.getBlockDecl(); CurGD = GD; @@ -1309,13 +1318,9 @@ // and cast it later.
QualType selfTy = getContext().VoidPtrTy; - // For OpenCL passed block pointer can be private AS local variable or - // global AS program scope variable (for the case with and without captures). - // Generic AS is used therefore to be able to accommodate both private and - // generic AS in one implementation. - if (getLangOpts().OpenCL) - selfTy = getContext().getPointerType(getContext().getAddrSpaceQualType( - getContext().VoidTy, LangAS::opencl_generic)); + if (blockInfo.asOpenCLKernel()) + selfTy = CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + getContext(), blockInfo); IdentifierInfo *II = &CGM.getContext().Idents.get(".block_descriptor"); @@ -1329,8 +1334,8 @@ // Create the function declaration. const FunctionProtoType *fnType = blockInfo.getBlockExpr()->getFunctionType(); - const CGFunctionInfo &fnInfo = - CGM.getTypes().arrangeBlockFunctionDeclaration(fnType, args); + const CGFunctionInfo &fnInfo = CGM.getTypes().arrangeBlockFunctionDeclaration( + fnType, args, blockInfo.asOpenCLKernel()); if (CGM.ReturnSlotInterferesWithArgs(fnInfo)) blockInfo.UsesStret = true; Index: lib/CodeGen/CGBuiltin.cpp =================================================================== --- lib/CodeGen/CGBuiltin.cpp +++ lib/CodeGen/CGBuiltin.cpp @@ -2609,7 +2609,8 @@ Int32Ty, llvm::ArrayRef<llvm::Type *>(ArgTys, 4), false); llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(3)), GenericVoidPtrTy); + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)), + GenericVoidPtrTy); AttrBuilder B; B.addAttribute(Attribute::ByVal); @@ -2650,8 +2651,9 @@ if (E->getArg(3)->getType()->isBlockPointerType()) { // No events passed, but has variadic arguments. Name = "__enqueue_kernel_vaargs"; - auto *Block = Builder.CreatePointerCast(EmitScalarExpr(E->getArg(3)), - GenericVoidPtrTy); + auto *Block = Builder.CreatePointerCast( + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(3)), + GenericVoidPtrTy); auto *PtrToSizeArray = CreateArrayForSizeVar(4); // Create a vector of the arguments, as well as a constant value to @@ -2689,7 +2691,8 @@ EventList = Builder.CreatePointerCast(EventList, EventPtrTy); ClkEvent = Builder.CreatePointerCast(ClkEvent, EventPtrTy); llvm::Value *Block = Builder.CreatePointerCast( - EmitScalarExpr(E->getArg(6)), GenericVoidPtrTy); + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(6)), + GenericVoidPtrTy); std::vector<llvm::Type *> ArgTys = { QueueTy, Int32Ty, RangeTy, Int32Ty, @@ -2730,7 +2733,8 @@ case Builtin::BIget_kernel_work_group_size: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); + Value *Arg = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( @@ -2741,7 +2745,8 @@ case Builtin::BIget_kernel_preferred_work_group_size_multiple: { llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); - Value *Arg = EmitScalarExpr(E->getArg(0)); + Value *Arg = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(0)); Arg = Builder.CreatePointerCast(Arg, GenericVoidPtrTy); return RValue::get(Builder.CreateCall( CGM.CreateRuntimeFunction( @@ -2755,7 +2760,8 @@ getContext().getTargetAddressSpace(LangAS::opencl_generic)); LValue NDRangeL = EmitAggExprToLValue(E->getArg(0)); llvm::Value *NDRange = NDRangeL.getAddress().getPointer(); - Value
*Block = EmitScalarExpr(E->getArg(1)); + Value *Block = + CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1)); Block = Builder.CreatePointerCast(Block, GenericVoidPtrTy); const char *Name = BuiltinID == Builtin::BIget_kernel_max_sub_group_size_for_ndrange Index: lib/CodeGen/CGCall.cpp =================================================================== --- lib/CodeGen/CGCall.cpp +++ lib/CodeGen/CGCall.cpp @@ -608,14 +608,17 @@ const CGFunctionInfo & CodeGenTypes::arrangeBlockFunctionDeclaration(const FunctionProtoType *proto, - const FunctionArgList &params) { + const FunctionArgList &params, + bool AsOpenCLKernel) { auto paramInfos = getExtParameterInfosForCall(proto, 1, params.size()); auto argTypes = getArgTypesForDeclaration(Context, params); + auto Info = proto->getExtInfo(); + if (AsOpenCLKernel) + Info = Info.withCallingConv(CC_OpenCLKernel); return arrangeLLVMFunctionInfo( GetReturnType(proto->getReturnType()), - /*instanceMethod*/ false, /*chainCall*/ false, argTypes, - proto->getExtInfo(), paramInfos, + /*instanceMethod*/ false, /*chainCall*/ false, argTypes, Info, paramInfos, RequiredArgs::forPrototypePlus(proto, 1, nullptr)); } Index: lib/CodeGen/CGDecl.cpp =================================================================== --- lib/CodeGen/CGDecl.cpp +++ lib/CodeGen/CGDecl.cpp @@ -1787,7 +1787,8 @@ // The only implicit argument a block has is its literal. // We assume this is always passed directly. if (BlockInfo) { - setBlockContextParameter(IPD, ArgNo, Arg.getDirectValue()); + assert(BlockInfo->asOpenCLKernel() || !Arg.isIndirect()); + setBlockContextParameter(IPD, ArgNo, Arg); return; } Index: lib/CodeGen/CGOpenCLRuntime.h =================================================================== --- lib/CodeGen/CGOpenCLRuntime.h +++ lib/CodeGen/CGOpenCLRuntime.h @@ -17,11 +17,13 @@ #define LLVM_CLANG_LIB_CODEGEN_CGOPENCLRUNTIME_H #include "clang/AST/Type.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" namespace clang { +class Expr; class VarDecl; namespace CodeGen { @@ -34,6 +36,8 @@ CodeGenModule &CGM; llvm::Type *PipeTy; llvm::PointerType *SamplerTy; + /// Maps block expression to llvm value. + llvm::DenseMap<const Expr *, llvm::Value *> EnqueuedBlockMap; public: CGOpenCLRuntime(CodeGenModule &CGM) : CGM(CGM), PipeTy(nullptr), @@ -62,6 +66,9 @@ /// \return __generic void* type. llvm::PointerType *getGenericVoidPointerType(); + + /// \return block literal for enqueued block.
+ llvm::Value *emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, const Expr *E); }; } Index: lib/CodeGen/CGOpenCLRuntime.cpp =================================================================== --- lib/CodeGen/CGOpenCLRuntime.cpp +++ lib/CodeGen/CGOpenCLRuntime.cpp @@ -109,3 +109,26 @@ CGM.getLLVMContext(), CGM.getContext().getTargetAddressSpace(LangAS::opencl_generic)); } + +llvm::Value *CGOpenCLRuntime::emitOpenCLEnqueuedBlock(CodeGenFunction &CGF, + const Expr *E) { + if (auto DR = dyn_cast<DeclRefExpr>(E)) { + E = cast<VarDecl>(DR->getDecl())->getInit(); + } + if (auto Cast = dyn_cast<CastExpr>(E)) { + E = Cast->getSubExpr(); + } + auto *Block = cast<BlockExpr>(E); + bool Cacheable = !Block->getBlockDecl()->hasCaptures(); + if (Cacheable) { + auto Loc = EnqueuedBlockMap.find(Block); + if (Loc != EnqueuedBlockMap.end()) { + return Loc->second; + } + } + auto *V = CGF.EmitBlockLiteral(Block, true); + if (Cacheable) { + EnqueuedBlockMap[Block] = V; + } + return V; +} Index: lib/CodeGen/CodeGenFunction.h =================================================================== --- lib/CodeGen/CodeGenFunction.h +++ lib/CodeGen/CodeGenFunction.h @@ -1430,8 +1430,7 @@ /// Add OpenCL kernel arg metadata and the kernel attribute metadata to /// the function metadata. - void EmitOpenCLKernelMetadata(const FunctionDecl *FD, - llvm::Function *Fn); + void EmitOpenCLKernelMetadata(const Decl *FD, llvm::Function *Fn); public: CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext=false); @@ -1580,13 +1579,13 @@ // Block Bits //===--------------------------------------------------------------------===// - llvm::Value *EmitBlockLiteral(const BlockExpr *); + llvm::Value *EmitBlockLiteral(const BlockExpr *, bool AsOpenCLKernel = false); static void destroyBlockInfos(CGBlockInfo *info); - llvm::Function *GenerateBlockFunction(GlobalDecl GD, - const CGBlockInfo &Info, + llvm::Function *GenerateBlockFunction(GlobalDecl GD, const CGBlockInfo &Info, const DeclMapTy &ldm, - bool IsLambdaConversionToBlock); + bool IsLambdaConversionToBlock, + bool AsOpenCLKernel); llvm::Constant *GenerateCopyHelperFunction(const CGBlockInfo &blockInfo); llvm::Constant *GenerateDestroyHelperFunction(const CGBlockInfo &blockInfo); @@ -1603,8 +1602,9 @@ void emitByrefStructureInit(const AutoVarEmission &emission); void enterByrefCleanup(const AutoVarEmission &emission); + class ParamValue; void setBlockContextParameter(const ImplicitParamDecl *D, unsigned argNum, - llvm::Value *ptr); + const ParamValue &PV); Address LoadBlockStruct(); Address GetAddrOfBlockDecl(const VarDecl *var, bool ByRef); Index: lib/CodeGen/CodeGenFunction.cpp =================================================================== --- lib/CodeGen/CodeGenFunction.cpp +++ lib/CodeGen/CodeGenFunction.cpp @@ -494,9 +494,11 @@ // OpenCL v1.2 s5.6.4.6 allows the compiler to store kernel argument // information in the program executable. The argument information stored // includes the argument name, its type, the address and access qualifiers used. -static void GenOpenCLArgMetadata(const FunctionDecl *FD, llvm::Function *Fn, - CodeGenModule &CGM, llvm::LLVMContext &Context, +static void GenOpenCLArgMetadata(const Decl *D, llvm::Function *Fn, + CodeGenFunction &CGF, + llvm::LLVMContext &Context, CGBuilderTy &Builder, ASTContext &ASTCtx) { + assert((isa<FunctionDecl>(D) || isa<BlockDecl>(D))); // Create MDNodes that represent the kernel arg metadata. // Each MDNode is a list in the form of "key", N number of values which is // the same number of values as there are kernel arguments.
@@ -521,10 +523,23 @@ // MDNode for the kernel argument names. SmallVector<llvm::Metadata *, 8> argNames; - for (unsigned i = 0, e = FD->getNumParams(); i != e; ++i) { - const ParmVarDecl *parm = FD->getParamDecl(i); - QualType ty = parm->getType(); + bool IsBlock = isa<BlockDecl>(D); + auto Params = IsBlock ? cast<BlockDecl>(D)->parameters() + : cast<FunctionDecl>(D)->parameters(); + for (unsigned i = 0, e = IsBlock ? Params.size() + 1 : Params.size(); i != e; + ++i) { std::string typeQuals; + QualType ty; + StringRef Name; + if (i == 0 && IsBlock) { + ty = CGF.CGM.getTargetCodeGenInfo().getEnqueuedBlockArgumentType( + ASTCtx, *CGF.BlockInfo); + Name = "block_context"; + } else { + auto *Arg = Params[IsBlock ? i - 1 : i]; + ty = Arg->getType(); + Name = Arg->getName(); + } if (ty->isPointerType()) { QualType pointeeTy = ty->getPointeeType(); @@ -621,7 +636,8 @@ // Get image and pipe access qualifier: if (ty->isImageType()|| ty->isPipeType()) { - const Decl *PDecl = parm; + // Only a normal kernel function can have image and pipe type arguments. + const Decl *PDecl = cast<FunctionDecl>(D)->getParamDecl(i); if (auto *TD = dyn_cast<TypedefType>(ty)) PDecl = TD->getDecl(); const OpenCLAccessAttr *A = PDecl->getAttr<OpenCLAccessAttr>(); @@ -635,7 +651,7 @@ accessQuals.push_back(llvm::MDString::get(Context, "none")); // Get argument name. - argNames.push_back(llvm::MDString::get(Context, parm->getName())); + argNames.push_back(llvm::MDString::get(Context, Name)); } Fn->setMetadata("kernel_arg_addr_space", @@ -648,20 +664,18 @@ llvm::MDNode::get(Context, argBaseTypeNames)); Fn->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(Context, argTypeQuals)); - if (CGM.getCodeGenOpts().EmitOpenCLArgMetadata) + if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata) Fn->setMetadata("kernel_arg_name", llvm::MDNode::get(Context, argNames)); } -void CodeGenFunction::EmitOpenCLKernelMetadata(const FunctionDecl *FD, - llvm::Function *Fn) -{ - if (!FD->hasAttr<OpenCLKernelAttr>()) - return; +void CodeGenFunction::EmitOpenCLKernelMetadata(const Decl *FD, + llvm::Function *Fn) { + assert(isa<FunctionDecl>(FD) || isa<BlockDecl>(FD)); llvm::LLVMContext &Context = getLLVMContext(); - GenOpenCLArgMetadata(FD, Fn, CGM, Context, Builder, getContext()); + GenOpenCLArgMetadata(FD, Fn, *this, Context, Builder, getContext()); if (const VecTypeHintAttr *A = FD->getAttr<VecTypeHintAttr>()) { QualType HintQTy = A->getTypeHint(); @@ -842,11 +856,9 @@ if (CGM.getCodeGenOpts().ProfileSampleAccurate) Fn->addFnAttr("profile-sample-accurate"); - if (getLangOpts().OpenCL) { - // Add metadata for a kernel function. - if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) - EmitOpenCLKernelMetadata(FD, Fn); - } + // Add metadata for a kernel function. + if (FnInfo.getASTCallingConvention() == CC_OpenCLKernel) + EmitOpenCLKernelMetadata(D, Fn); // If we are checking function types, emit a function type signature as // prologue data. Index: lib/CodeGen/CodeGenTypes.h =================================================================== --- lib/CodeGen/CodeGenTypes.h +++ lib/CodeGen/CodeGenTypes.h @@ -291,9 +291,10 @@ const CallArgList &args); /// Block invocation functions are C functions with an implicit parameter.
- const CGFunctionInfo &arrangeBlockFunctionDeclaration( - const FunctionProtoType *type, - const FunctionArgList &args); + const CGFunctionInfo & + arrangeBlockFunctionDeclaration(const FunctionProtoType *type, + const FunctionArgList &args, + bool AsOpenCLKernel = false); const CGFunctionInfo &arrangeBlockFunctionCall(const CallArgList &args, const FunctionType *type); Index: lib/CodeGen/TargetInfo.h =================================================================== --- lib/CodeGen/TargetInfo.h +++ lib/CodeGen/TargetInfo.h @@ -32,11 +32,13 @@ namespace clang { class Decl; +class ASTContext; namespace CodeGen { class ABIInfo; class CallArgList; class CodeGenFunction; +class CGBlockInfo; class CGFunctionInfo; /// TargetCodeGenInfo - This class organizes various target-specific @@ -265,6 +267,10 @@ /// Get the syncscope used in LLVM IR. virtual llvm::SyncScope::ID getLLVMSyncScopeID(SyncScope S, llvm::LLVMContext &C) const; + + /// Get the QualType for the block argument of an enqueued block. + virtual QualType getEnqueuedBlockArgumentType(ASTContext &C, + const CGBlockInfo &Info) const; }; } // namespace CodeGen Index: lib/CodeGen/TargetInfo.cpp =================================================================== --- lib/CodeGen/TargetInfo.cpp +++ lib/CodeGen/TargetInfo.cpp @@ -14,6 +14,7 @@ #include "TargetInfo.h" #include "ABIInfo.h" +#include "CGBlocks.h" #include "CGCXXABI.h" #include "CGValue.h" #include "CodeGenFunction.h" @@ -7624,6 +7625,8 @@ const VarDecl *D) const override; llvm::SyncScope::ID getLLVMSyncScopeID(SyncScope S, llvm::LLVMContext &C) const override; + QualType getEnqueuedBlockArgumentType(ASTContext &C, + const CGBlockInfo &Info) const override; }; } @@ -8924,3 +8927,51 @@ return SetCGInfo(new SPIRTargetCodeGenInfo(Types)); } } + +QualType +TargetCodeGenInfo::getEnqueuedBlockArgumentType(ASTContext &C, + const CGBlockInfo &Info) const { + return C.getPointerType( + C.getAddrSpaceQualType(C.VoidTy, LangAS::opencl_global)); +} + +QualType AMDGPUTargetCodeGenInfo::getEnqueuedBlockArgumentType( + ASTContext &C, const CGBlockInfo &Info) const { + unsigned Size = Info.BlockSize.getQuantity(); + unsigned Align = Info.BlockAlign.getQuantity(); + assert(Align == 4 || Align == 8); + // Create a struct type which has the same size and alignment as the block + // argument. 
+ RecordDecl *RD = C.buildImplicitRecord("__amdgpu_block_arg_t"); + RD->startDefinition(); + auto AddField = [&](unsigned ElemSize, unsigned NumElem) { + assert(ElemSize == 4 || ElemSize == 8 || ElemSize == 1); + llvm::APInt ArraySize(C.getTargetInfo().getIntWidth(), NumElem); + QualType ElemType; + switch (ElemSize) { + case 8: + ElemType = C.LongTy; + break; + case 4: + ElemType = C.IntTy; + break; + case 1: + ElemType = C.CharTy; + break; + default: + llvm_unreachable("invalid element size"); + } + const QualType FieldTy = C.getConstantArrayType( + ElemType, ArraySize, ArrayType::Normal, /*TypeQualifiers=*/0); + auto *Field = FieldDecl::Create(C, RD, SourceLocation(), SourceLocation(), + &C.Idents.get("data"), FieldTy, + /*TInfo=*/nullptr, /*BitWidth=*/nullptr, + /*Mutable=*/false, ICIS_NoInit); + RD->addDecl(Field); + }; + AddField(Align, Size / Align); + if (unsigned Rem = Size % Align) + AddField(1, Rem); + RD->completeDefinition(); + return C.getTagDeclType(RD); +} Index: test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl =================================================================== --- /dev/null +++ test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -emit-llvm -o - -triple amdgcn | FileCheck %s --check-prefix=CHECK + +// CHECK: %[[S1:struct.__amdgpu_block_arg_t.*]] = type { [3 x i64], [1 x i8] } +// CHECK: %[[S2:struct.__amdgpu_block_arg_t.*]] = type { [5 x i64], [1 x i8] } +// CHECK: %[[S3:struct.__amdgpu_block_arg_t.*]] = type { [2 x i64] } + +typedef struct {int a;} ndrange_t; + +// CHECK-LABEL: define amdgpu_kernel void @test +kernel void test(global char *a, char b, global long *c, long d) { + queue_t default_queue; + unsigned flags = 0; + ndrange_t ndrange; + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + }); + + enqueue_kernel(default_queue, flags, ndrange, + ^(void) { + a[0] = b; + c[0] = d; + }); + enqueue_kernel(default_queue, flags, ndrange, + ^(local void *a) { + local int *p = (local int *)a; + p[0] = 1; + }, + 100); +} + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke +// CHECK-SAME: (%[[S1]] %{{.*}}) +// CHECK: %block = bitcast %[[S1]]* %{{.*}} to <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_2 +// CHECK-SAME: (%[[S2]] %{{.*}}) +// CHECK: %block = bitcast %[[S2]]* %{{.*}} to <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* + +// CHECK-LABEL: define internal amdgpu_kernel void @__test_block_invoke_3 +// CHECK-SAME: (%[[S3]] %{{.*}}, i8 addrspace(3)* %a) #{{[0-9]+}} !kernel_arg_addr_space ![[MD:[0-9]+]] !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} Index: test/CodeGenOpenCL/blocks.cl =================================================================== --- test/CodeGenOpenCL/blocks.cl +++ test/CodeGenOpenCL/blocks.cl @@ -44,6 +44,6 @@ } // COMMON-LABEL: define internal {{.*}}i32 @__foo_block_invoke(i8 addrspace(4)* %.block_descriptor) -// COMMON: %[[block:.*]] = bitcast i8 addrspace(4)* %.block_descriptor to <{ i32, i32, i8 addrspace(4)*, i32 }> addrspace(4)* -// COMMON: %[[block_capture_addr:.*]] = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }> addrspace(4)* %[[block]], i32 0, i32 3 -// COMMON: %[[r1:.*]] = load i32, i32 addrspace(4)* %[[block_capture_addr]] +// COMMON: %[[block:.*]] = addrspacecast i8 addrspace(4)* %.block_descriptor 
to <{ i32, i32, i8 addrspace(4)*, i32 }>* +// COMMON: %[[block_capture_addr:.*]] = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i32 }>, <{ i32, i32, i8 addrspace(4)*, i32 }>* %[[block]], i32 0, i32 3 +// COMMON: %[[r1:.*]] = load i32, i32* %[[block_capture_addr]] Index: test/CodeGenOpenCL/cl20-device-side-enqueue.cl =================================================================== --- test/CodeGenOpenCL/cl20-device-side-enqueue.cl +++ test/CodeGenOpenCL/cl20-device-side-enqueue.cl @@ -6,10 +6,33 @@ typedef void (^bl_t)(local void *); typedef struct {int a;} ndrange_t; -// N.B. The check here only exists to set BL_GLOBAL -// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) +// COMMON: %struct.__opencl_block_literal_generic = type { i32, i32, i8 addrspace(4)* } + +// For a block global variable, first emit the block literal as a global variable, then emit the block variable itself. +// COMMON: [[BL_GLOBAL:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INV_G:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: @block_G = addrspace(1) constant void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*) + +// For anonymous blocks without captures, emit block literals as global variable. 
+// COMMON: [[BLG1:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG1:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG2:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG2:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG3:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG3:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG4:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG4:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG5:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG5:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG6:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*, i8 addrspace(3)*, i8 addrspace(3)*)* [[INVG6:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG7:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG7:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*)* [[INVG8:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(4)*, i8 addrspace(3)*)* [[INVG9:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG8K:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*)* [[INVG8K:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG9K:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INVG9K:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BL_GLOBAL_K:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*, i8 addrspace(3)*)* [[INV_G_K:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG10:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, 
i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*)* [[INVG10:@[^ ]+]] to i8*) to i8 addrspace(4)*) } +// COMMON: [[BLG11:@__block_literal_global[^ ]*]] = internal addrspace(1) constant { i32, i32, i8 addrspace(4)* } { i32 {{[0-9]+}}, i32 {{[0-9]+}}, i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*)* [[INVG11:@[^ ]+]] to i8*) to i8 addrspace(4)*) } + +// Emits block literal [[BL_GLOBAL]], invoke function [[INV_G]] and global block variable @block_G +// COMMON: define internal spir_func void [[INV_G]](i8 addrspace(4)* %.block_descriptor, i8 addrspace(3)* %a) const bl_t block_G = (bl_t) ^ (local void *a) {}; +// COMMON-LABEL: define spir_kernel void @device_side_enqueue(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %i) kernel void device_side_enqueue(global int *a, global int *b, int i) { // COMMON: %default_queue = alloca %opencl.queue_t* queue_t default_queue; @@ -24,62 +47,75 @@ // COMMON: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*] clk_event_t event_wait_list2[] = {clk_event}; + // Emits block literal on stack and invoke function [[INVL1]]. // COMMON: [[NDR:%[a-z0-9]+]] = alloca %struct.ndrange_t, align 4 // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*)* [[INVL1:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // B32: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32, i32 addrspace(1)* }>* %block to void ()* // B64: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32 addrspace(1)*, i32 addrspace(1)*, i32 }>* %block to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* byval [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* [[BL_I8]]) enqueue_kernel(default_queue, flags, ndrange, ^(void) { a[i] = b[i]; }); + // Emits block literal on stack and invoke function [[INVL2]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[WAIT_EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %event_wait_list to %opencl.clk_event_t{{.*}}* addrspace(4)* // COMMON: [[EVNT:%[0-9]+]] = addrspacecast %opencl.clk_event_t{{.*}}** %clk_event to %opencl.clk_event_t{{.*}}* addrspace(4)* + // COMMON: store i8 addrspace(4)* addrspacecast (i8* bitcast (void (i8 addrspace(1)*)* [[INVL2:@__device_side_enqueue_block_invoke[^ ]*]] to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke // COMMON: [[BL:%[0-9]+]] = bitcast <{ i32, i32, i8 addrspace(4)*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()* // COMMON: [[BL_I8:%[0-9]+]] = addrspacecast void ()* [[BL]] to i8 addrspace(4)* - // COMMON: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]]) + // COMMON-LABEL: call i32 @__enqueue_kernel_basic_events + // COMMON-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* [[BL_I8]]) enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event, ^(void) { a[i] = b[i]; }); + // Emits global block literal [[BLG1]] and invoke function [[INVG1]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG1]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 256); char c; + // Emits global block literal [[BLG2]] and invoke 
function [[INVG2]]. // COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG2]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, c); + // Emits global block literal [[BLG3]] and invoke function [[INVG3]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -88,17 +124,20 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 256, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 256, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}} [[WAIT_EVNT]], %opencl.clk_event_t{{.*}} [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG3]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; }, 256); + // Emits global block literal [[BLG4]] and invoke function [[INVG4]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // COMMON: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0 @@ -107,11 +146,13 @@ // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_events_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_events_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* {{.*}}, i32 2, %opencl.clk_event_t{{.*}}* addrspace(4)* [[WAIT_EVNT]], %opencl.clk_event_t{{.*}}* addrspace(4)* [[EVNT]], i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG4]] to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event, ^(local void *p) { return; @@ -119,22 +160,26 @@ c); long l; + // Emits global block literal [[BLG5]] and invoke function [[INVG5]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 %{{.*}}, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 %{{.*}}, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, l); + // Emits global block literal [[BLG6]] and invoke function [[INVG6]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t{{.*}}*, %opencl.queue_t{{.*}}** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [3 x i32] @@ -144,7 +189,8 @@ // B32: store i32 2, i32* %[[TMP2]], align 4 // B32: %[[TMP3:.*]] = getelementptr [3 x i32], [3 x i32]* %[[TMP]], i32 0, i32 2 // B32: store i32 4, i32* %[[TMP3]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [3 x i64] // B64: %[[TMP1:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 1, i64* %[[TMP1]], align 8 @@ -152,52 +198,128 @@ // B64: store i64 2, i64* %[[TMP2]], align 8 // B64: %[[TMP3:.*]] = getelementptr [3 x i64], [3 x i64]* %[[TMP]], i32 0, i32 2 // B64: store i64 4, i64* %[[TMP3]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 3, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p1, local void *p2, local void *p3) { return; }, 1, 2, 4); + // Emits global block literal [[BLG7]] and invoke function [[INVG7]]. 
// COMMON: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue // COMMON: [[FLAGS:%[0-9]+]] = load i32, i32* %flags // B32: %[[TMP:.*]] = alloca [1 x i32] // B32: %[[TMP1:.*]] = getelementptr [1 x i32], [1 x i32]* %[[TMP]], i32 0, i32 0 // B32: store i32 0, i32* %[[TMP1]], align 4 - // B32: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) + // B32-LABEL: call i32 @__enqueue_kernel_vaargs + // B32-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i32* %[[TMP1]]) // B64: %[[TMP:.*]] = alloca [1 x i64] // B64: %[[TMP1:.*]] = getelementptr [1 x i64], [1 x i64]* %[[TMP]], i32 0, i32 0 // B64: store i64 4294967296, i64* %[[TMP1]], align 8 - // B64: call i32 @__enqueue_kernel_vaargs(%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) + // B64-LABEL: call i32 @__enqueue_kernel_vaargs + // B64-SAME: (%opencl.queue_t{{.*}}* [[DEF_Q]], i32 [[FLAGS]], %struct.ndrange_t* [[NDR]]{{([0-9]+)?}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* @__block_literal_global{{(.[0-9]+)?}} to i8 addrspace(1)*) to i8 addrspace(4)*), i32 1, i64* %[[TMP1]]) enqueue_kernel(default_queue, flags, ndrange, ^(local void *p) { return; }, 4294967296L); + // Emits global block literal [[BLG8]] and invoke function [[INVG8]]. // The full type of these expressions are long (and repeated elsewhere), so we // capture it as part of the regex for convenience and clarity. - // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A:@__block_literal_global(\.[0-9]+)?]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A + // COMMON: store void () addrspace(4)* addrspacecast (void () addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to void () addrspace(1)*) to void () addrspace(4)*), void () addrspace(4)** %block_A void (^const block_A)(void) = ^{ return; }; - // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B:@__block_literal_global(\.[0-9]+)?]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B + // Emits global block literal [[BLG9]] and invoke function [[INVG9]]. 
+ // COMMON: store void (i8 addrspace(3)*) addrspace(4)* addrspacecast (void (i8 addrspace(3)*) addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9]] to void (i8 addrspace(3)*) addrspace(1)*) to void (i8 addrspace(3)*) addrspace(4)*), void (i8 addrspace(3)*) addrspace(4)** %block_B void (^const block_B)(local void *) = ^(local void *a) { return; }; - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. + // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG8K]] and invoke function [[INVG8K]]. [[INVG8K]] is the same as [[INVG8]] except calling convention, ABI and metadata. + // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) unsigned size = get_kernel_work_group_size(block_A); - // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_B]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8]] and invoke function [[INVG8]]. Make sure no redundant block literal and invoke functions are emitted. + // COMMON: [[r1:%.*]] = load i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* getelementptr inbounds (%struct.__opencl_block_literal_generic, %struct.__opencl_block_literal_generic addrspace(4)* addrspacecast (%struct.__opencl_block_literal_generic addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to %struct.__opencl_block_literal_generic addrspace(1)*) to %struct.__opencl_block_literal_generic addrspace(4)*), i32 0, i32 2) + // COMMON: [[r2:%.*]] = addrspacecast i8 addrspace(4)* [[r1]] to void (i8 addrspace(4)*)* + // COMMON: call spir_func void [[r2]](i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + block_A(); + + // Emits global block literal [[BLG9K]] and invoke function [[INVG9K]]. [[INVG9K]] is the same as [[INVG9]] except calling convention, ABI and metadata.
+ // COMMON: call i32 @__get_kernel_work_group_size_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG9K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_work_group_size(block_B); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_A]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BLG8K]] and invoke function [[INVG8K]]. Make sure no redundant block literal and invoke functions are emitted. + // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG8K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_A); - // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL]] to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Uses global block literal [[BL_GLOBAL_K]] and invoke function [[INV_G_K]]. [[INV_G_K]] is the same as [[INV_G]] except calling convention, ABI and metadata. + // COMMON: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BL_GLOBAL_K]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_preferred_work_group_size_multiple(block_G); - // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + // Emits global block literal [[BLG10]] and invoke function [[INVG10]]. + // COMMON: call i32 @__get_kernel_max_sub_group_size_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG10]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_max_sub_group_size_for_ndrange(ndrange, ^(){}); - // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* {{.*}} to i8 addrspace(1)*) to i8 addrspace(4)*)) + + // Emits global block literal [[BLG11]] and invoke function [[INVG11]].
+ // COMMON: call i32 @__get_kernel_sub_group_count_for_ndrange_impl(%struct.ndrange_t* {{.*}}, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* bitcast ({ i32, i32, i8 addrspace(4)* } addrspace(1)* [[BLG11]] to i8 addrspace(1)*) to i8 addrspace(4)*)) size = get_kernel_sub_group_count_for_ndrange(ndrange, ^(){}); } + +// COMMON: define internal spir_kernel void [[INVL1]](i8 addrspace(1)* %{{.*}}) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVL2]](i8 addrspace(1)* %{{.*}}) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG1]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG2]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG3]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG4]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG5]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG6]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p1, i8 addrspace(3)* %p2, i8 addrspace(3)* %p3) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG7]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %p) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_func void [[INVG8]](i8 addrspace(4)* %{{.*}}) +// COMMON-NOT: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_func void [[INVG9]](i8 addrspace(4)* %{{.*}}, i8 addrspace(3)* %a) +// COMMON-NOT: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG8K]](i8 addrspace(1)* %{{.*}}) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual 
!{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG9K]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %a) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INV_G_K]](i8 addrspace(1)* %{{.*}}, i8 addrspace(3)* %a) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG10]](i8 addrspace(1)* %{{.*}}) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}} + +// COMMON: define internal spir_kernel void [[INVG11]](i8 addrspace(1)* %{{.*}}) +// COMMON-SAME: #{{[0-9]+}} !kernel_arg_addr_space !{{.*}} !kernel_arg_access_qual !{{.*}} !kernel_arg_type !{{.*}} !kernel_arg_base_type !{{.*}} !kernel_arg_type_qual !{{.*}}
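Worked example (not part of the patch): AMDGPUTargetCodeGenInfo::getEnqueuedBlockArgumentType above packs the block literal into one [Size/Align x iN] array plus an optional [Size%Align x i8] tail. The sizes below are read off the block literal types in the amdgpu-enqueue-kernel.cl CHECK lines (a 16-byte header of size, align and invoke pointer, followed by the captures); the standalone C++ sketch only reprints that packing rule for illustration and is not clang code.

#include <cstdio>

// Mirrors AddField(Align, Size / Align) plus the [Rem x i8] tail used in the
// patch; Align 8 maps to i64 (LongTy) and Align 4 to i32 (IntTy) on amdgcn.
static void printBlockArgStruct(unsigned Size, unsigned Align) {
  const char *Elem = Align == 8 ? "i64" : "i32";
  std::printf("{ [%u x %s]", Size / Align, Elem);
  if (unsigned Rem = Size % Align)
    std::printf(", [%u x i8]", Rem);
  std::printf(" }\n");
}

int main() {
  // <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> = 25 bytes
  printBlockArgStruct(25, 8); // -> { [3 x i64], [1 x i8] }  (%[[S1]])
  // <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> = 41 bytes
  printBlockArgStruct(41, 8); // -> { [5 x i64], [1 x i8] }  (%[[S2]])
  // header only, no captures = 16 bytes
  printBlockArgStruct(16, 8); // -> { [2 x i64] }            (%[[S3]])
  return 0;
}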