diff --git a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp --- a/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPUOpenMP.cpp @@ -114,6 +114,10 @@ } } + AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "amdgcn", + SubArchName, + /* bitcode SDL?*/ true, + /* PostClang Link? */ false); // Add an intermediate output file. CmdArgs.push_back("-o"); const char *OutputFileName = diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7734,12 +7734,28 @@ Triples += Action::GetOffloadKindName(CurKind); Triples += '-'; Triples += CurTC->getTriple().normalize(); - if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_OpenMP || - CurKind == Action::OFK_Cuda) && + if ((CurKind == Action::OFK_HIP || CurKind == Action::OFK_Cuda) && CurDep->getOffloadingArch()) { Triples += '-'; Triples += CurDep->getOffloadingArch(); } + + // TODO: Replace parsing of -march flag. Can be done by storing GPUArch + // with each toolchain. + StringRef GPUArchName; + if (CurKind == Action::OFK_OpenMP) { + // Extract GPUArch from -march argument in TC argument list. + for (unsigned ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) { + auto ArchStr = StringRef(TCArgs.getArgString(ArgIndex)); + auto Arch = ArchStr.startswith_insensitive("-march="); + if (Arch) { + GPUArchName = ArchStr.substr(7); + Triples += "-"; + break; + } + } + Triples += GPUArchName.str(); + } } CmdArgs.push_back(TCArgs.MakeArgString(Triples)); @@ -7813,12 +7829,27 @@ Triples += '-'; Triples += Dep.DependentToolChain->getTriple().normalize(); if ((Dep.DependentOffloadKind == Action::OFK_HIP || - Dep.DependentOffloadKind == Action::OFK_OpenMP || Dep.DependentOffloadKind == Action::OFK_Cuda) && !Dep.DependentBoundArch.empty()) { Triples += '-'; Triples += Dep.DependentBoundArch; } + // TODO: Replace parsing of -march flag. Can be done by storing GPUArch + // with each toolchain. + StringRef GPUArchName; + if (Dep.DependentOffloadKind == Action::OFK_OpenMP) { + // Extract GPUArch from -march argument in TC argument list. + for (uint ArgIndex = 0; ArgIndex < TCArgs.size(); ArgIndex++) { + StringRef ArchStr = StringRef(TCArgs.getArgString(ArgIndex)); + auto Arch = ArchStr.startswith_insensitive("-march="); + if (Arch) { + GPUArchName = ArchStr.substr(7); + Triples += "-"; + break; + } + } + Triples += GPUArchName.str(); + } } CmdArgs.push_back(TCArgs.MakeArgString(Triples)); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -49,6 +49,39 @@ llvm::opt::ArgStringList &CmdArgs, const llvm::opt::ArgList &Args); +void AddStaticDeviceLibsLinking(Compilation &C, const Tool &T, + const JobAction &JA, + const InputInfoList &Inputs, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CmdArgs, + StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink); +void AddStaticDeviceLibsPostLinking(const Driver &D, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CmdArgs, + StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink); +void AddStaticDeviceLibs(Compilation *C, const Tool *T, const JobAction *JA, + const InputInfoList *Inputs, const Driver &D, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CmdArgs, StringRef Arch, + StringRef Target, bool isBitCodeSDL, + bool postClangLink); + +bool SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CmdArgs, + SmallVector LibraryPaths, std::string Lib, + StringRef Arch, StringRef Target, bool isBitCodeSDL, + bool postClangLink); + +bool GetSDLFromOffloadArchive(Compilation &C, const Driver &D, const Tool &T, + const JobAction &JA, const InputInfoList &Inputs, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + SmallVector LibraryPaths, + StringRef Lib, StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink); + const char *SplitDebugName(const JobAction &JA, const llvm::opt::ArgList &Args, const InputInfo &Input, const InputInfo &Output); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -34,6 +34,7 @@ #include "clang/Driver/Util.h" #include "clang/Driver/XRayArgs.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -1587,6 +1588,292 @@ } } +/// SDLSearch: Search for Static Device Library +/// The search for SDL bitcode files is consistent with how static host +/// libraries are discovered. That is, the -l option triggers a search for +/// files in a set of directories called the LINKPATH. The host library search +/// procedure looks for a specific filename in the LINKPATH. The filename for +/// a host library is lib.a or lib.so. For SDLs, there is an +/// ordered-set of filenames that are searched. We call this ordered-set of +/// filenames as SEARCH-ORDER. Since an SDL can either be device-type specific, +/// architecture specific, or generic across all architectures, a naming +/// convention and search order is used where the file name embeds the +/// architecture name (nvptx or amdgcn) and the GPU device type +/// such as sm_30 and gfx906. is absent in case of +/// device-independent SDLs. To reduce congestion in host library directories, +/// the search first looks for files in the “libdevice” subdirectory. SDLs that +/// are bc files begin with the prefix “lib”. +/// +/// Machine-code SDLs can also be managed as an archive (*.a file). The +/// convention has been to use the prefix “lib”. To avoid confusion with host +/// archive libraries, we use prefix "libbc-" for the bitcode SDL archives. +/// +bool tools::SDLSearch(const Driver &D, const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + SmallVector LibraryPaths, std::string Lib, + StringRef Arch, StringRef Target, bool isBitCodeSDL, + bool postClangLink) { + SmallVector SDLs; + + std::string LibDeviceLoc = "/libdevice"; + std::string LibBcPrefix = "/libbc-"; + std::string LibPrefix = "/lib"; + + if (isBitCodeSDL) { + // SEARCH-ORDER for Bitcode SDLs: + // libdevice/libbc---.a + // libbc---.a + // libdevice/libbc--.a + // libbc--.a + // libdevice/libbc-.a + // libbc-.a + // libdevice/lib--.bc + // lib--.bc + // libdevice/lib-.bc + // lib-.bc + // libdevice/lib.bc + // lib.bc + + for (StringRef Base : {LibBcPrefix, LibPrefix}) { + const auto *Ext = Base.contains(LibBcPrefix) ? ".a" : ".bc"; + + for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(), + Twine(Lib + "-" + Arch).str(), Twine(Lib).str()}) { + SDLs.push_back(Twine(LibDeviceLoc + Base + Suffix + Ext).str()); + SDLs.push_back(Twine(Base + Suffix + Ext).str()); + } + } + } else { + // SEARCH-ORDER for Machine-code SDLs: + // libdevice/lib--.a + // lib--.a + // libdevice/lib-.a + // lib-.a + + const auto *Ext = ".a"; + + for (auto Suffix : {Twine(Lib + "-" + Arch + "-" + Target).str(), + Twine(Lib + "-" + Arch).str()}) { + SDLs.push_back(Twine(LibDeviceLoc + LibPrefix + Suffix + Ext).str()); + SDLs.push_back(Twine(LibPrefix + Suffix + Ext).str()); + } + } + + // The CUDA toolchain does not use a global device llvm-link before the LLVM + // backend generates ptx. So currently, the use of bitcode SDL for nvptx is + // only possible with post-clang-cc1 linking. Clang cc1 has a feature that + // will link libraries after clang compilation while the LLVM IR is still in + // memory. This utilizes a clang cc1 option called “-mlink-builtin-bitcode”. + // This is a clang -cc1 option that is generated by the clang driver. The + // option value must a full path to an existing file. + bool FoundSDL = false; + for (auto LPath : LibraryPaths) { + for (auto SDL : SDLs) { + auto FullName = Twine(LPath + SDL).str(); + if (llvm::sys::fs::exists(FullName)) { + if (postClangLink) + CC1Args.push_back("-mlink-builtin-bitcode"); + CC1Args.push_back(DriverArgs.MakeArgString(FullName)); + FoundSDL = true; + break; + } + } + if (FoundSDL) + break; + } + return FoundSDL; +} + +/// Search if a user provided archive file lib.a exists in any of +/// the library paths. If so, add a new command to clang-offload-bundler to +/// unbundle this archive and create a temporary device specific archive. Name +/// of this SDL is passed to the llvm-link (for amdgcn) or to the +/// clang-nvlink-wrapper (for nvptx) commands by the driver. +bool tools::GetSDLFromOffloadArchive( + Compilation &C, const Driver &D, const Tool &T, const JobAction &JA, + const InputInfoList &Inputs, const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, SmallVector LibraryPaths, + StringRef Lib, StringRef Arch, StringRef Target, bool isBitCodeSDL, + bool postClangLink) { + + // We don't support bitcode archive bundles for nvptx + if (isBitCodeSDL && Arch.contains("nvptx")) + return false; + + bool FoundAOB = false; + SmallVector AOBFileNames; + std::string ArchiveOfBundles; + for (auto LPath : LibraryPaths) { + ArchiveOfBundles.clear(); + + AOBFileNames.push_back(Twine(LPath + "/libdevice/lib" + Lib + ".a").str()); + AOBFileNames.push_back(Twine(LPath + "/lib" + Lib + ".a").str()); + + for (auto AOB : AOBFileNames) { + if (llvm::sys::fs::exists(AOB)) { + ArchiveOfBundles = AOB; + FoundAOB = true; + break; + } + } + + if (!FoundAOB) + continue; + + StringRef Prefix = isBitCodeSDL ? "libbc-" : "lib"; + std::string OutputLib = D.GetTemporaryPath( + Twine(Prefix + Lib + "-" + Arch + "-" + Target).str(), "a"); + + C.addTempFile(C.getArgs().MakeArgString(OutputLib.c_str())); + + ArgStringList CmdArgs; + SmallString<128> DeviceTriple; + DeviceTriple += Action::GetOffloadKindName(JA.getOffloadingDeviceKind()); + DeviceTriple += '-'; + std::string NormalizedTriple = T.getToolChain().getTriple().normalize(); + DeviceTriple += NormalizedTriple; + if (!Target.empty()) { + DeviceTriple += '-'; + DeviceTriple += Target; + } + + std::string UnbundleArg("-unbundle"); + std::string TypeArg("-type=a"); + std::string InputArg("-inputs=" + ArchiveOfBundles); + std::string OffloadArg("-targets=" + std::string(DeviceTriple)); + std::string OutputArg("-outputs=" + OutputLib); + + const char *UBProgram = DriverArgs.MakeArgString( + T.getToolChain().GetProgramPath("clang-offload-bundler")); + + ArgStringList UBArgs; + UBArgs.push_back(C.getArgs().MakeArgString(UnbundleArg.c_str())); + UBArgs.push_back(C.getArgs().MakeArgString(TypeArg.c_str())); + UBArgs.push_back(C.getArgs().MakeArgString(InputArg.c_str())); + UBArgs.push_back(C.getArgs().MakeArgString(OffloadArg.c_str())); + UBArgs.push_back(C.getArgs().MakeArgString(OutputArg.c_str())); + + // Add this flag to not exit from clang-offload-bundler if no compatible + // code object is found in heterogenous archive library. + std::string AdditionalArgs("-allow-missing-bundles"); + UBArgs.push_back(C.getArgs().MakeArgString(AdditionalArgs.c_str())); + + C.addCommand(std::make_unique( + JA, T, ResponseFileSupport::AtFileCurCP(), UBProgram, UBArgs, Inputs, + InputInfo(&JA, C.getArgs().MakeArgString(OutputLib.c_str())))); + if (postClangLink) + CC1Args.push_back("-mlink-builtin-bitcode"); + + CC1Args.push_back(DriverArgs.MakeArgString(OutputLib)); + break; + } + + return FoundAOB; +} + +// Wrapper function used by driver for adding SDLs during link phase. +void tools::AddStaticDeviceLibsLinking(Compilation &C, const Tool &T, + const JobAction &JA, + const InputInfoList &Inputs, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink) { + AddStaticDeviceLibs(&C, &T, &JA, &Inputs, C.getDriver(), DriverArgs, CC1Args, + Arch, Target, isBitCodeSDL, postClangLink); +} + +// Wrapper function used for post clang linking of bitcode SDLS for nvptx by +// the CUDA toolchain. +void tools::AddStaticDeviceLibsPostLinking(const Driver &D, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink) { + AddStaticDeviceLibs(nullptr, nullptr, nullptr, nullptr, D, DriverArgs, + CC1Args, Arch, Target, isBitCodeSDL, postClangLink); +} + +// User defined Static Device Libraries(SDLs) can be passed to clang for +// offloading GPU compilers. Like static host libraries, the use of a SDL is +// specified with the -l command line option. The primary difference between +// host and SDLs is the filenames for SDLs (refer SEARCH-ORDER for Bitcode SDLs +// and SEARCH-ORDER for Machine-code SDLs for the naming convention). +// SDLs are of following types: +// +// * Bitcode SDLs: They can either be a *.bc file or an archive of *.bc files. +// For NVPTX, these libraries are post-clang linked following each +// compilation. For AMDGPU, these libraries are linked one time +// during the application link phase. +// +// * Machine-code SDLs: They are archive files. For NVPTX, the archive members +// contain cubin for Nvidia GPUs and are linked one time during the +// link phase by the CUDA SDK linker called nvlink. For AMDGPU, the +// process for machine code SDLs is still in development. But they +// will be linked by the LLVM tool lld. +// +// * Bundled objects that contain both host and device codes: Bundled objects +// may also contain library code compiled from source. For NVPTX, the +// bundle contains cubin. For AMDGPU, the bundle contains bitcode. +// +// For Bitcode and Machine-code SDLs, current compiler toolchains hardcode the +// inclusion of specific SDLs such as math libraries and the OpenMP device +// library libomptarget. +void tools::AddStaticDeviceLibs(Compilation *C, const Tool *T, + const JobAction *JA, + const InputInfoList *Inputs, const Driver &D, + const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + StringRef Arch, StringRef Target, + bool isBitCodeSDL, bool postClangLink) { + + SmallVector LibraryPaths; + // Add search directories from LIBRARY_PATH env variable + llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH"); + if (LibPath) { + SmallVector Frags; + const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; + llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); + for (StringRef Path : Frags) + LibraryPaths.emplace_back(Path.trim()); + } + + // Add directories from user-specified -L options + for (std::string Search_Dir : DriverArgs.getAllArgValues(options::OPT_L)) + LibraryPaths.emplace_back(Search_Dir); + + // Add path to lib-debug folders + SmallString<256> DefaultLibPath = llvm::sys::path::parent_path(D.Dir); + llvm::sys::path::append(DefaultLibPath, Twine("lib") + CLANG_LIBDIR_SUFFIX); + LibraryPaths.emplace_back(DefaultLibPath.c_str()); + + // Build list of Static Device Libraries SDLs specified by -l option + llvm::SmallSet SDLNames; + static const StringRef HostOnlyArchives[] = { + "omp", "cudart", "m", "gcc", "gcc_s", "pthread", "hip_hcc"}; + for (auto SDLName : DriverArgs.getAllArgValues(options::OPT_l)) { + if (!HostOnlyArchives->contains(SDLName)) { + SDLNames.insert(SDLName); + } + } + + // The search stops as soon as an SDL file is found. The driver then provides + // the full filename of the SDL to the llvm-link or clang-nvlink-wrapper + // command. If no SDL is found after searching each LINKPATH with + // SEARCH-ORDER, it is possible that an archive file lib.a exists + // and may contain bundled object files. + for (auto SDLName : SDLNames) { + // This is the only call to SDLSearch + if (!SDLSearch(D, DriverArgs, CC1Args, LibraryPaths, SDLName, Arch, Target, + isBitCodeSDL, postClangLink)) { + GetSDLFromOffloadArchive(*C, D, *T, *JA, *Inputs, DriverArgs, CC1Args, + LibraryPaths, SDLName, Arch, Target, + isBitCodeSDL, postClangLink); + } + } +} + static llvm::opt::Arg * getAMDGPUCodeObjectArgument(const Driver &D, const llvm::opt::ArgList &Args) { // The last of -mcode-object-v3, -mno-code-object-v3 and diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -610,8 +610,11 @@ CmdArgs.push_back(CubinF); } + AddStaticDeviceLibsLinking(C, *this, JA, Inputs, Args, CmdArgs, "nvptx", GPUArch, + false, false); + const char *Exec = - Args.MakeArgString(getToolChain().GetProgramPath("nvlink")); + Args.MakeArgString(getToolChain().GetProgramPath("clang-nvlink-wrapper")); C.addCommand(std::make_unique( JA, *this, ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, @@ -741,6 +744,8 @@ addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix, getTriple()); + AddStaticDeviceLibsPostLinking(getDriver(), DriverArgs, CC1Args, "nvptx", GpuArch, + /* bitcode SDL?*/ true, /* PostClang Link? */ true); } } diff --git a/clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a b/clang/test/Driver/Inputs/openmp_static_device_link/libFatArchive.a new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@&1 | FileCheck %s +// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm-bc"{{.*}}"-target-cpu" "[[GPU:gfx[0-9]+]]"{{.*}}"-o" "[[HOSTBC:.*.bc]]" "-x" "c++"{{.*}}.cpp +// CHECK: clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}/Inputs/openmp_static_device_link/libFatArchive.a" "-targets=openmp-amdgcn-amd-amdhsa-[[GPU]]" "-outputs=[[DEVICESPECIFICARCHIVE:.*.a]]" "-allow-missing-bundles" +// CHECK: llvm-link{{.*}}"[[HOSTBC]]" "[[DEVICESPECIFICARCHIVE]]" "-o" "{{.*}}-[[GPU]]-linked-{{.*}}.bc" +// CHECK: ld"{{.*}}" "-L{{.*}}/Inputs/openmp_static_device_link" "{{.*}} "-lFatArchive" "{{.*}}" "-lomp{{.*}}-lomptarget" +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +#define N 10 + +#pragma omp declare target +// Functions defined in Fat Archive. +extern "C" void func_present(float *, float *, unsigned); + +#ifdef MISSING +// Function not defined in the fat archive. +extern "C" void func_missing(float *, float *, unsigned); +#endif + +#pragma omp end declare target + +int main() { + float in[N], out[N], sum = 0; + unsigned i; + +#pragma omp parallel for + for (i = 0; i < N; ++i) { + in[i] = i; + } + + func_present(in, out, N); // Returns out[i] = a[i] * 0 + +#ifdef MISSING + func_missing(in, out, N); // Should throw an error here +#endif + +#pragma omp parallel for reduction(+ \ + : sum) + for (i = 0; i < N; ++i) + sum += out[i]; + + if (!sum) + return 0; + return sum; +} + +#endif + +/*********************************************** + Steps to create Fat Archive (libFatArchive.a) +************************************************ +***************** File: func_1.c *************** +void func_present(float* in, float* out, unsigned n){ + unsigned i; + #pragma omp target teams distribute parallel for map(to: in[0:n]) map(from: out[0:n]) + for(i=0; i&1 | FileCheck %s +// CHECK: clang{{.*}}"-cc1"{{.*}}"-triple" "nvptx64"{{.*}}"-target-cpu" "[[GPU:sm_[0-9]+]]"{{.*}}"-o" "[[HOSTBC:.*.s]]" "-x" "c++"{{.*}}.cpp +// CHECK: clang-offload-bundler" "-unbundle" "-type=a" "-inputs={{.*}}/Inputs/openmp_static_device_link/libFatArchive.a" "-targets=openmp-nvptx64-[[GPU]]" "-outputs=[[DEVICESPECIFICARCHIVE:.*.a]]" "-allow-missing-bundles" +// CHECK: clang-nvlink-wrapper{{.*}}"-o" "{{.*}}.out" "-arch" "[[GPU]]" "{{.*}}[[DEVICESPECIFICARCHIVE]]" +// CHECK: ld"{{.*}}" "-L{{.*}}/Inputs/openmp_static_device_link" "{{.*}} "-lFatArchive" "{{.*}}" "-lomp{{.*}}-lomptarget" +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +#define N 10 + +#pragma omp declare target +// Functions defined in Fat Archive. +extern "C" void func_present(float *, float *, unsigned); + +#ifdef MISSING +// Function not defined in the fat archive. +extern "C" void func_missing(float *, float *, unsigned); +#endif + +#pragma omp end declare target + +int main() { + float in[N], out[N], sum = 0; + unsigned i; + +#pragma omp parallel for + for (i = 0; i < N; ++i) { + in[i] = i; + } + + func_present(in, out, N); // Returns out[i] = a[i] * 0 + +#ifdef MISSING + func_missing(in, out, N); // Should throw an error here +#endif + +#pragma omp parallel for reduction(+ \ + : sum) + for (i = 0; i < N; ++i) + sum += out[i]; + + if (!sum) + return 0; + return sum; +} + +#endif + +/*********************************************** + Steps to create Fat Archive (libFatArchive.a) +************************************************ +***************** File: func_1.c *************** +void func_present(float* in, float* out, unsigned n){ + unsigned i; + #pragma omp target teams distribute parallel for map(to: in[0:n]) map(from: out[0:n]) + for(i=0; i