Index: include/llvm/Support/AMDGPUCodeObjectMetadata.h =================================================================== --- include/llvm/Support/AMDGPUCodeObjectMetadata.h +++ include/llvm/Support/AMDGPUCodeObjectMetadata.h @@ -115,6 +115,8 @@ constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint"; /// \brief Key for Kernel::Attr::Metadata::mVecTypeHint. constexpr char VecTypeHint[] = "VecTypeHint"; +/// \brief Key for Kernel::Attr::Metadata::mRuntimeHandle. +constexpr char RuntimeHandle[] = "RuntimeHandle"; } // end namespace Key /// \brief In-memory representation of kernel attributes metadata. @@ -125,15 +127,17 @@ std::vector mWorkGroupSizeHint = std::vector(); /// \brief 'vec_type_hint' attribute. Optional. std::string mVecTypeHint = std::string(); + /// \brief External symbol created by runtime to store the kernel address + /// for enqueued blocks. + std::string mRuntimeHandle = std::string(); /// \brief Default constructor. Metadata() = default; /// \returns True if kernel attributes metadata is empty, false otherwise. bool empty() const { - return mReqdWorkGroupSize.empty() && - mWorkGroupSizeHint.empty() && - mVecTypeHint.empty(); + return mReqdWorkGroupSize.empty() && mWorkGroupSizeHint.empty() && + mVecTypeHint.empty() && mRuntimeHandle.empty(); } /// \returns True if kernel attributes metadata is not empty, false otherwise. 
Index: lib/Support/AMDGPUCodeObjectMetadata.cpp =================================================================== --- lib/Support/AMDGPUCodeObjectMetadata.cpp +++ lib/Support/AMDGPUCodeObjectMetadata.cpp @@ -96,6 +96,8 @@ MD.mWorkGroupSizeHint, std::vector()); YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint, MD.mVecTypeHint, std::string()); + YIO.mapOptional(Kernel::Attrs::Key::RuntimeHandle, MD.mRuntimeHandle, + std::string()); } }; Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -185,6 +185,10 @@ Pass *createAMDGPUFunctionInliningPass(); void initializeAMDGPUInlinerPass(PassRegistry&); +ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); +void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); +extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); Index: lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp @@ -0,0 +1,99 @@ +//===- AMDGPUOpenCLEnqueuedBlockLowering.cpp - Lower enqueued block -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// \brief This post-linking pass replaces the function pointer of enqueued +// block kernel with a global variable (runtime handle) and adds +// "runtime-handle" attribute to the enqueued block kernel. +// +// In LLVM CodeGen the runtime-handle metadata will be translated to +// RuntimeHandle metadata in code object. 
Runtime allocates a global buffer +// for each kernel with RuntimeHandle metadata and saves the kernel address +// required for the AQL packet into the buffer. __enqueue_kernel function +// in device library knows that the invoke function pointer in the block +// literal is actually a runtime handle and loads the kernel address from it +// and puts it into the AQL packet for dispatching. +// +// This cannot be done in FE since FE cannot create a unique global variable +// with external linkage across LLVM modules. The global variable with internal +// linkage does not work since optimization passes will try to replace loads +// of the global variable with its initialization value. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "amdgpu-lower-enqueued-block" + +using namespace llvm; + +namespace { + + /// \brief Lower OpenCL enqueued blocks to runtime handles. 
+ class AMDGPUOpenCLEnqueuedBlockLowering : public ModulePass { + public: + static char ID; + + explicit AMDGPUOpenCLEnqueuedBlockLowering() : ModulePass(ID) {} + + private: + bool runOnModule(Module &M) override; +}; + +} // end anonymous namespace + +char AMDGPUOpenCLEnqueuedBlockLowering::ID = 0; + +char &llvm::AMDGPUOpenCLEnqueuedBlockLoweringID = + AMDGPUOpenCLEnqueuedBlockLowering::ID; + +INITIALIZE_PASS(AMDGPUOpenCLEnqueuedBlockLowering, + "amdgpu-lower-enqueued-block", "Lower OpenCL enqueued blocks", + false, false) + +ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() { + return new AMDGPUOpenCLEnqueuedBlockLowering(); +} + +bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) { + auto &C = M.getContext(); + auto AS = AMDGPU::getAMDGPUAS(M); + bool Changed = false; + for (auto &I : M.functions()) { + if (I.hasFnAttribute("enqueued-block")) { + if (!I.hasOneUse() || !I.user_begin()->hasOneUse() || + !isa(*I.user_begin()) || + !isa(*I.user_begin()->user_begin())) { + continue; + } + auto *BitCast = cast(*I.user_begin()); + auto *AddrCast = cast(*BitCast->user_begin()); + std::string RuntimeHandle = I.getName().str() + "_runtime_handle"; + auto *GV = new GlobalVariable( + M, Type::getInt8Ty(C)->getPointerTo(AS.GLOBAL_ADDRESS), + /*IsConstant=*/true, GlobalValue::ExternalLinkage, + /*Initializer=*/nullptr, RuntimeHandle, /*InsertBefore=*/nullptr, + GlobalValue::NotThreadLocal, AS.GLOBAL_ADDRESS, + /*IsExternallyInitialized=*/true); + DEBUG(llvm::dbgs() << "runtime handle created: " << *GV << '\n'); + auto *NewPtr = ConstantExpr::getPointerCast(GV, AddrCast->getType()); + AddrCast->replaceAllUsesWith(NewPtr); + I.addFnAttr("runtime-handle", RuntimeHandle); + I.setLinkage(GlobalValue::ExternalLinkage); + Changed = true; + } + } + return Changed; +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ 
lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -161,6 +161,7 @@ initializeAMDGPUAnnotateUniformValuesPass(*PR); initializeAMDGPUArgumentUsageInfoPass(*PR); initializeAMDGPULowerIntrinsicsPass(*PR); + initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR); initializeAMDGPUPromoteAllocaPass(*PR); initializeAMDGPUCodeGenPreparePass(*PR); initializeAMDGPURewriteOutArgumentsPass(*PR); @@ -609,6 +610,9 @@ // Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments. addPass(createAMDGPUOpenCLImageTypeLoweringPass()); + // Replace OpenCL enqueued block function pointers with global variables. + addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass()); + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createInferAddressSpacesPass()); addPass(createAMDGPUPromoteAlloca()); Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -39,6 +39,7 @@ AMDGPUMachineModuleInfo.cpp AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp + AMDGPUOpenCLEnqueuedBlockLowering.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp AMDGPUPromoteAlloca.cpp AMDGPURegAsmNames.inc.cpp Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp @@ -244,6 +244,10 @@ cast(Node->getOperand(0))->getType(), mdconst::extract(Node->getOperand(1))->getZExtValue()); } + if (Func.hasFnAttribute("runtime-handle")) { + Attrs.mRuntimeHandle = + Func.getFnAttribute("runtime-handle").getValueAsString().str(); + } } void MetadataStreamer::emitKernelArgs(const Function &Func) { Index: test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll =================================================================== --- test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll +++ 
test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll @@ -14,6 +14,8 @@ %struct.B = type { i32 addrspace(1)*} %opencl.clk_event_t = type opaque +@__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* + ; CHECK: --- ; CHECK: Version: [ 1, 0 ] ; CHECK: Printf: @@ -1197,6 +1199,44 @@ ret void } +; CHECK: - Name: __test_block_invoke_kernel +; CHECK-NEXT: Language: OpenCL C +; CHECK-NEXT: LanguageVersion: [ 2, 0 ] +; CHECK-NEXT: Attrs: +; CHECK-NEXT: RuntimeHandle: __test_block_invoke_kernel_runtime_handle +; CHECK-NEXT: Args: +; CHECK-NEXT: - Size: 25 +; CHECK-NEXT: Align: 1 +; CHECK-NEXT: ValueKind: ByValue +; CHECK-NEXT: ValueType: Struct +; CHECK-NEXT: AccQual: Default +; CHECK-NEXT: TypeName: __block_literal +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ +; CHECK-NEXT: ValueType: I64 +; CHECK-NEXT: - Size: 8 +; CHECK-NEXT: Align: 8 +; CHECK-NEXT: ValueKind: HiddenPrintfBuffer +; CHECK-NEXT: ValueType: I8 +; CHECK-NEXT: AddrSpaceQual: Global +define amdgpu_kernel void @__test_block_invoke_kernel( + <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1 + !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !110 + !kernel_arg_base_type !110 !kernel_arg_type_qual !4 { + ret void +} + +attributes #1 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" } + !llvm.printf.fmts = !{!100, !101} !1 = !{i32 0} @@ -1250,13 +1290,14 @@ !94 = !{!"", !"", !"", !"", !"", !"", !""} !100 = !{!"1:1:4:%d\5Cn"} !101 = !{!"2:1:8:%g\5Cn"} +!110 = !{!"__block_literal"} ; NOTES: Displaying notes found at file offset 0x{{[0-9]+}} ; NOTES-NEXT: Owner Data size Description ; NOTES-NEXT: AMD 
0x00000008 Unknown note type: (0x00000001) ; NOTES-NEXT: AMD 0x0000001b Unknown note type: (0x00000003) -; GFX700: AMD 0x00008b0a Unknown note type: (0x0000000a) -; GFX800: AMD 0x00008e6e Unknown note type: (0x0000000a) -; GFX900: AMD 0x00008b0a Unknown note type: (0x0000000a) +; GFX700: AMD 0x00008f64 Unknown note type: (0x0000000a) +; GFX800: AMD 0x000092e4 Unknown note type: (0x0000000a) +; GFX900: AMD 0x00008f64 Unknown note type: (0x0000000a) ; PARSER: AMDGPU Code Object Metadata Parser Test: PASS Index: test/CodeGen/AMDGPU/enqueue-kernel.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/enqueue-kernel.ll @@ -0,0 +1,102 @@ +; RUN: opt -amdgpu-lower-enqueued-block -S -verify-machineinstrs < %s | FileCheck %s + +; CHECK: @__test_block_invoke_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* +; CHECK: @__test_block_invoke_2_kernel_runtime_handle = external addrspace(1) externally_initialized constant i8 addrspace(1)* + +target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn-amdhsa-amd-opencl" + +%struct.ndrange_t = type { i32 } +%opencl.queue_t = type opaque + +define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 { +entry: + %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8 + %tmp = alloca %struct.ndrange_t, align 4 + %block2 = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, align 8 + %tmp3 = alloca %struct.ndrange_t, align 4 + %block.size = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 
}>* %block, i32 0, i32 0 + store i32 25, i32* %block.size, align 8 + %block.align = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 1 + store i32 8, i32* %block.align, align 4 + %block.invoke = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>)* @__test_block_invoke_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke, align 8 + %block.captured = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured, align 8, !tbaa !7 + %block.captured1 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block, i32 0, i32 4 + store i8 %b, i8* %block.captured1, align 8, !tbaa !11 + %tmp1 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>* %block to void ()* + %tmp2 = bitcast void ()* %tmp1 to i8* + %tmp4 = addrspacecast i8* %tmp2 to i8 addrspace(4)* + %tmp5 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, %struct.ndrange_t* byval nonnull %tmp, i8 addrspace(4)* nonnull %tmp4) #2 + %block.size4 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 0 + store i32 41, i32* %block.size4, align 8 + %block.align5 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 1 + 
store i32 8, i32* %block.align5, align 4 + %block.invoke6 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 2 + store i8 addrspace(4)* addrspacecast (i8* bitcast (void (<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>)* @__test_block_invoke_2_kernel to i8*) to i8 addrspace(4)*), i8 addrspace(4)** %block.invoke6, align 8 + %block.captured7 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 3 + store i8 addrspace(1)* %a, i8 addrspace(1)** %block.captured7, align 8, !tbaa !7 + %block.captured8 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 6 + store i8 %b, i8* %block.captured8, align 8, !tbaa !11 + %block.captured9 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 4 + store i64 addrspace(1)* %c, i64 addrspace(1)** %block.captured9, align 8, !tbaa !7 + %block.captured10 = getelementptr inbounds <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>, <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2, i32 0, i32 5 + store i64 %d, i64* %block.captured10, align 8, !tbaa !12 + %tmp6 = bitcast <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }>* %block2 to void ()* + %tmp7 = bitcast void ()* %tmp6 to i8* + %tmp8 = addrspacecast i8* %tmp7 to i8 addrspace(4)* + %tmp9 = call i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)* undef, i32 0, 
%struct.ndrange_t* byval nonnull %tmp3, i8 addrspace(4)* nonnull %tmp8) #2 + ret void +} + +; CHECK: define amdgpu_kernel void @__test_block_invoke_kernel({{.*}}) #[[AT1:[0-9]+]] +define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg) #1 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }> %arg, 4 + store i8 %.fca.4.extract, i8 addrspace(1)* %.fca.3.extract, align 1, !tbaa !11 + ret void +} + +declare i32 @__enqueue_kernel_basic(%opencl.queue_t addrspace(1)*, i32, %struct.ndrange_t*, i8 addrspace(4)*) local_unnamed_addr + +; CHECK: define amdgpu_kernel void @__test_block_invoke_2_kernel({{.*}}) #[[AT2:[0-9]+]] +define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg) #1 !kernel_arg_addr_space !14 !kernel_arg_access_qual !15 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !17 { +entry: + %.fca.3.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 3 + %.fca.4.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 4 + %.fca.5.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 5 + %.fca.6.extract = extractvalue <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i64 addrspace(1)*, i64, i8 }> %arg, 6 + store i8 %.fca.6.extract, i8 addrspace(1)* %.fca.3.extract, align 1, !tbaa !11 + store i64 %.fca.5.extract, i64 addrspace(1)* %.fca.4.extract, align 8, !tbaa !12 + ret void +} + +; CHECK: attributes #[[AT1]] = 
{{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle" +; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle" + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "enqueued-block" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 2, i32 0} +!3 = !{i32 1, i32 0, i32 1, i32 0} +!4 = !{!"none", !"none", !"none", !"none"} +!5 = !{!"char*", !"char", !"long*", !"long"} +!6 = !{!"", !"", !"", !""} +!7 = !{!8, !8, i64 0} +!8 = !{!"any pointer", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!9, !9, i64 0} +!12 = !{!13, !13, i64 0} +!13 = !{!"long", !9, i64 0} +!14 = !{i32 0} +!15 = !{!"none"} +!16 = !{!"__block_literal"} +!17 = !{!""}