Index: flang/docs/GettingStarted.md =================================================================== --- flang/docs/GettingStarted.md +++ flang/docs/GettingStarted.md @@ -180,6 +180,76 @@ ninja check-flang ``` +### Building flang runtime for accelerators +Flang runtime can be built for accelerators in experimental mode, i.e. +complete enabling is WIP. CUDA and OpenMP target offload builds +are currently supported. + +#### Building out-of-tree + +##### CUDA build +Clang with NVPTX backend and NVCC compilers are supported. + +```bash +cd llvm-project/flang +mkdir -rf build_flang_runtime +mkdir build_flang_runtime +cd build_flang_runtime + +cmake \ + -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \ + -DCMAKE_CUDA_ARCHITECTURES=80 \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_COMPILER=clang \ + ../runtime/ +make -j FortranRuntime +``` + +```bash +cd llvm-project/flang +mkdir -rf build_flang_runtime +mkdir build_flang_runtime +cd build_flang_runtime + +cmake \ + -DFLANG_EXPERIMENTAL_CUDA_RUNTIME=ON \ + -DCMAKE_CUDA_ARCHITECTURES=80 \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CUDA_COMPILER=nvcc \ + ../runtime/ +make -j FortranRuntime +``` + +The result of the build is a "fat" library with the host and device +code. Note that the packaging of the libraries is different +between [Clang](https://clang.llvm.org/docs/OffloadingDesign.html#linking-target-device-code) and NVCC, so the library must be linked using +compatible compiler drivers. + +##### OpenMP target offload build +Only Clang compiler is currently supported. + +``` +cd llvm-project/flang +mkdir -rf build_flang_runtime +mkdir build_flang_runtime +cd build_flang_runtime + +cmake \ + -DFLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD="host_device" \ + -DCMAKE_C_COMPILER=clang \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DFLANG_OMP_DEVICE_ARCHITECTURES="all" \ + ../runtime/ +make -j FortranRuntime +``` + +The result of the build is a "device-only" library, i.e. the host +part of the library is just a container for the device code. +The resulting library may be linked to user programs using +Clang-like device linking pipeline. + ## Supported C++ compilers Flang is written in C++17. Index: flang/include/flang/ISO_Fortran_binding.h =================================================================== --- flang/include/flang/ISO_Fortran_binding.h +++ flang/include/flang/ISO_Fortran_binding.h @@ -18,6 +18,8 @@ * implementation. */ +#include "Runtime/api-attrs.h" + #ifdef __cplusplus namespace Fortran { namespace ISO { @@ -121,8 +123,8 @@ // care of getting the memory storage. Note that it already contains one element // because a struct cannot be empty. template struct FlexibleArray : T { - T &operator[](int index) { return *(this + index); } - const T &operator[](int index) const { return *(this + index); } + RT_API_ATTRS T &operator[](int index) { return *(this + index); } + const RT_API_ATTRS T &operator[](int index) const { return *(this + index); } operator T *() { return this; } operator const T *() const { return this; } }; @@ -174,11 +176,11 @@ void *CFI_address(const CFI_cdesc_t *, const CFI_index_t subscripts[]); int CFI_allocate(CFI_cdesc_t *, const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[], size_t elem_len); -int CFI_deallocate(CFI_cdesc_t *); +RT_API_ATTRS int CFI_deallocate(CFI_cdesc_t *); int CFI_establish(CFI_cdesc_t *, void *base_addr, CFI_attribute_t, CFI_type_t, size_t elem_len, CFI_rank_t, const CFI_index_t extents[]); int CFI_is_contiguous(const CFI_cdesc_t *); -int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source, +RT_API_ATTRS int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source, const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[], const CFI_index_t strides[]); int CFI_select_part(CFI_cdesc_t *, const CFI_cdesc_t *source, Index: flang/include/flang/Runtime/api-attrs.h =================================================================== --- /dev/null +++ flang/include/flang/Runtime/api-attrs.h @@ -0,0 +1,91 @@ +/*===-- include/flang/Runtime/api-attrs.h ---------------------------*- C -*-=// + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +/* + * The file defines a set macros that can be used to apply + * different attributes/pragmas to functions/variables + * declared/defined/used in Flang runtime library. + */ + +#ifndef FORTRAN_RUNTIME_API_ATTRS_H_ +#define FORTRAN_RUNTIME_API_ATTRS_H_ + +/* + * RT_EXT_API_GROUP_BEGIN/END pair is placed around definitions + * of functions exported by Flang runtime library. They are the entry + * points that are referenced in the Flang generated code. + * The macros may be expanded into any construct that is valid to appear + * at C++ module scope. + */ +#ifndef RT_EXT_API_GROUP_BEGIN +#if defined(OMP_NOHOST_BUILD) +#define RT_EXT_API_GROUP_BEGIN \ + _Pragma("omp begin declare target device_type(nohost)") +#elif defined(OMP_OFFLOAD_BUILD) +#define RT_EXT_API_GROUP_BEGIN _Pragma("omp declare target") +#else +#define RT_EXT_API_GROUP_BEGIN +#endif +#endif /* !defined(RT_EXT_API_GROUP_BEGIN) */ + +#ifndef RT_EXT_API_GROUP_END +#if defined(OMP_NOHOST_BUILD) || defined(OMP_OFFLOAD_BUILD) +#define RT_EXT_API_GROUP_END _Pragma("omp end declare target") +#else +#define RT_EXT_API_GROUP_END +#endif +#endif /* !defined(RT_EXT_API_GROUP_END) */ + +/* + * RT_VAR_GROUP_BEGIN/END pair is placed around definitions + * of module scope variables referenced by Flang runtime (directly + * or indirectly). + * The macros may be expanded into any construct that is valid to appear + * at C++ module scope. + */ +#ifndef RT_VAR_GROUP_BEGIN +#define RT_VAR_GROUP_BEGIN RT_EXT_API_GROUP_BEGIN +#endif /* !defined(RT_VAR_GROUP_BEGIN) */ + +#ifndef RT_VAR_GROUP_END +#define RT_VAR_GROUP_END RT_EXT_API_GROUP_END +#endif /* !defined(RT_VAR_GROUP_END) */ + +/* + * Each non-exported function used by Flang runtime (e.g. via + * calling it or taking its address, etc.) is marked with + * RT_API_ATTRS. The macros is placed at both declaration and + * definition of such a function. + * The macros may be expanded into a construct that is valid + * to appear as part of a C++ decl-specifier. + */ +#ifndef RT_API_ATTRS +#if defined(__CUDACC__) || defined(__CUDA__) +#define RT_API_ATTRS __host__ __device__ +#else +#define RT_API_ATTRS +#endif +#endif /* !defined(RT_API_ATTRS) */ + +/* + * Each const/constexpr module scope variable referenced by Flang runtime + * (directly or indirectly) is marked with RT_CONST_VAR_ATTRS. + * The macros is placed at both declaration and definition of such a variable. + * The macros may be expanded into a construct that is valid + * to appear as part of a C++ decl-specifier. + */ +#ifndef RT_CONST_VAR_ATTRS +#if defined(__CUDACC__) || defined(__CUDA__) +#define RT_CONST_VAR_ATTRS __constant__ +#else +#define RT_CONST_VAR_ATTRS +#endif +#endif /* !defined(RT_CONST_VAR_ATTRS) */ + +#endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */ Index: flang/include/flang/Runtime/descriptor.h =================================================================== --- flang/include/flang/Runtime/descriptor.h +++ flang/include/flang/Runtime/descriptor.h @@ -37,19 +37,24 @@ using SubscriptValue = ISO::CFI_index_t; -static constexpr int maxRank{CFI_MAX_RANK}; +RT_VAR_GROUP_BEGIN +static constexpr RT_CONST_VAR_ATTRS int maxRank{CFI_MAX_RANK}; +RT_VAR_GROUP_END // A C++ view of the sole interoperable standard descriptor (ISO::CFI_cdesc_t) // and its type and per-dimension information. class Dimension { public: - SubscriptValue LowerBound() const { return raw_.lower_bound; } - SubscriptValue Extent() const { return raw_.extent; } - SubscriptValue UpperBound() const { return LowerBound() + Extent() - 1; } - SubscriptValue ByteStride() const { return raw_.sm; } + RT_API_ATTRS SubscriptValue LowerBound() const { return raw_.lower_bound; } + RT_API_ATTRS SubscriptValue Extent() const { return raw_.extent; } + RT_API_ATTRS SubscriptValue UpperBound() const { + return LowerBound() + Extent() - 1; + } + RT_API_ATTRS SubscriptValue ByteStride() const { return raw_.sm; } - Dimension &SetBounds(SubscriptValue lower, SubscriptValue upper) { + RT_API_ATTRS Dimension &SetBounds( + SubscriptValue lower, SubscriptValue upper) { if (upper >= lower) { raw_.lower_bound = lower; raw_.extent = upper - lower + 1; @@ -74,7 +79,7 @@ raw_.extent = extent; return *this; } - Dimension &SetByteStride(SubscriptValue bytes) { + RT_API_ATTRS Dimension &SetByteStride(SubscriptValue bytes) { raw_.sm = bytes; return *this; } @@ -91,29 +96,34 @@ // array is determined by derivedType_->LenParameters(). class DescriptorAddendum { public: - explicit DescriptorAddendum(const typeInfo::DerivedType *dt = nullptr) + explicit RT_API_ATTRS DescriptorAddendum( + const typeInfo::DerivedType *dt = nullptr) : derivedType_{dt} {} - DescriptorAddendum &operator=(const DescriptorAddendum &); + RT_API_ATTRS DescriptorAddendum &operator=(const DescriptorAddendum &); - const typeInfo::DerivedType *derivedType() const { return derivedType_; } - DescriptorAddendum &set_derivedType(const typeInfo::DerivedType *dt) { + const RT_API_ATTRS typeInfo::DerivedType *derivedType() const { + return derivedType_; + } + RT_API_ATTRS DescriptorAddendum &set_derivedType( + const typeInfo::DerivedType *dt) { derivedType_ = dt; return *this; } - std::size_t LenParameters() const; + RT_API_ATTRS std::size_t LenParameters() const; - typeInfo::TypeParameterValue LenParameterValue(int which) const { + RT_API_ATTRS typeInfo::TypeParameterValue LenParameterValue(int which) const { return len_[which]; } - static constexpr std::size_t SizeInBytes(int lenParameters) { + static constexpr RT_API_ATTRS std::size_t SizeInBytes(int lenParameters) { // TODO: Don't waste that last word if lenParameters == 0 return sizeof(DescriptorAddendum) + std::max(lenParameters - 1, 0) * sizeof(typeInfo::TypeParameterValue); } - std::size_t SizeInBytes() const; + RT_API_ATTRS std::size_t SizeInBytes() const; - void SetLenParameterValue(int which, typeInfo::TypeParameterValue x) { + RT_API_ATTRS void SetLenParameterValue( + int which, typeInfo::TypeParameterValue x) { len_[which] = x; } @@ -142,30 +152,34 @@ // Create() static member functions otherwise to dynamically allocate a // descriptor. - Descriptor(const Descriptor &); - Descriptor &operator=(const Descriptor &); + RT_API_ATTRS Descriptor(const Descriptor &); + RT_API_ATTRS Descriptor &operator=(const Descriptor &); // Returns the number of bytes occupied by an element of the given // category and kind including any alignment padding required // between adjacent elements. - static std::size_t BytesFor(TypeCategory category, int kind); + static RT_API_ATTRS std::size_t BytesFor(TypeCategory category, int kind); - void Establish(TypeCode t, std::size_t elementBytes, void *p = nullptr, - int rank = maxRank, const SubscriptValue *extent = nullptr, - ISO::CFI_attribute_t attribute = CFI_attribute_other, - bool addendum = false); - void Establish(TypeCategory, int kind, void *p = nullptr, int rank = maxRank, + RT_API_ATTRS void Establish(TypeCode t, std::size_t elementBytes, + void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other, bool addendum = false); - void Establish(int characterKind, std::size_t characters, void *p = nullptr, + RT_API_ATTRS void Establish(TypeCategory, int kind, void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other, bool addendum = false); - void Establish(const typeInfo::DerivedType &dt, void *p = nullptr, - int rank = maxRank, const SubscriptValue *extent = nullptr, + RT_API_ATTRS void Establish(int characterKind, std::size_t characters, + void *p = nullptr, int rank = maxRank, + const SubscriptValue *extent = nullptr, + ISO::CFI_attribute_t attribute = CFI_attribute_other, + bool addendum = false); + RT_API_ATTRS void Establish(const typeInfo::DerivedType &dt, + void *p = nullptr, int rank = maxRank, + const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other); + // CUDA_TODO: Clang does not support unique_ptr on device. static OwningPtr Create(TypeCode t, std::size_t elementBytes, void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, @@ -183,37 +197,40 @@ const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other); - ISO::CFI_cdesc_t &raw() { return raw_; } - const ISO::CFI_cdesc_t &raw() const { return raw_; } - std::size_t ElementBytes() const { return raw_.elem_len; } - int rank() const { return raw_.rank; } - TypeCode type() const { return TypeCode{raw_.type}; } + RT_API_ATTRS ISO::CFI_cdesc_t &raw() { return raw_; } + const RT_API_ATTRS ISO::CFI_cdesc_t &raw() const { return raw_; } + RT_API_ATTRS std::size_t ElementBytes() const { return raw_.elem_len; } + RT_API_ATTRS int rank() const { return raw_.rank; } + RT_API_ATTRS TypeCode type() const { return TypeCode{raw_.type}; } - Descriptor &set_base_addr(void *p) { + RT_API_ATTRS Descriptor &set_base_addr(void *p) { raw_.base_addr = p; return *this; } - bool IsPointer() const { return raw_.attribute == CFI_attribute_pointer; } - bool IsAllocatable() const { + RT_API_ATTRS bool IsPointer() const { + return raw_.attribute == CFI_attribute_pointer; + } + RT_API_ATTRS bool IsAllocatable() const { return raw_.attribute == CFI_attribute_allocatable; } - bool IsAllocated() const { return raw_.base_addr != nullptr; } + RT_API_ATTRS bool IsAllocated() const { return raw_.base_addr != nullptr; } - Dimension &GetDimension(int dim) { + RT_API_ATTRS Dimension &GetDimension(int dim) { return *reinterpret_cast(&raw_.dim[dim]); } - const Dimension &GetDimension(int dim) const { + const RT_API_ATTRS Dimension &GetDimension(int dim) const { return *reinterpret_cast(&raw_.dim[dim]); } - std::size_t SubscriptByteOffset( + RT_API_ATTRS std::size_t SubscriptByteOffset( int dim, SubscriptValue subscriptValue) const { const Dimension &dimension{GetDimension(dim)}; return (subscriptValue - dimension.LowerBound()) * dimension.ByteStride(); } - std::size_t SubscriptsToByteOffset(const SubscriptValue subscript[]) const { + RT_API_ATTRS std::size_t SubscriptsToByteOffset( + const SubscriptValue subscript[]) const { std::size_t offset{0}; for (int j{0}; j < raw_.rank; ++j) { offset += SubscriptByteOffset(j, subscript[j]); @@ -221,16 +238,19 @@ return offset; } - template A *OffsetElement(std::size_t offset = 0) const { + template + RT_API_ATTRS A *OffsetElement(std::size_t offset = 0) const { return reinterpret_cast( reinterpret_cast(raw_.base_addr) + offset); } - template A *Element(const SubscriptValue subscript[]) const { + template + RT_API_ATTRS A *Element(const SubscriptValue subscript[]) const { return OffsetElement(SubscriptsToByteOffset(subscript)); } - template A *ZeroBasedIndexedElement(std::size_t n) const { + template + RT_API_ATTRS A *ZeroBasedIndexedElement(std::size_t n) const { SubscriptValue at[maxRank]; if (SubscriptsForZeroBasedElementNumber(at, n)) { return Element(at); @@ -238,14 +258,14 @@ return nullptr; } - int GetLowerBounds(SubscriptValue subscript[]) const { + RT_API_ATTRS int GetLowerBounds(SubscriptValue subscript[]) const { for (int j{0}; j < raw_.rank; ++j) { subscript[j] = GetDimension(j).LowerBound(); } return raw_.rank; } - int GetShape(SubscriptValue subscript[]) const { + RT_API_ATTRS int GetShape(SubscriptValue subscript[]) const { for (int j{0}; j < raw_.rank; ++j) { subscript[j] = GetDimension(j).Extent(); } @@ -255,7 +275,7 @@ // When the passed subscript vector contains the last (or first) // subscripts of the array, these wrap the subscripts around to // their first (or last) values and return false. - bool IncrementSubscripts( + RT_API_ATTRS bool IncrementSubscripts( SubscriptValue subscript[], const int *permutation = nullptr) const { for (int j{0}; j < raw_.rank; ++j) { int k{permutation ? permutation[j] : j}; @@ -268,12 +288,13 @@ return false; } - bool DecrementSubscripts( + RT_API_ATTRS bool DecrementSubscripts( SubscriptValue[], const int *permutation = nullptr) const; // False when out of range. - bool SubscriptsForZeroBasedElementNumber(SubscriptValue subscript[], - std::size_t elementNumber, const int *permutation = nullptr) const { + RT_API_ATTRS bool SubscriptsForZeroBasedElementNumber( + SubscriptValue subscript[], std::size_t elementNumber, + const int *permutation = nullptr) const { if (raw_.rank == 0) { return elementNumber == 0; } @@ -301,17 +322,17 @@ return true; } - std::size_t ZeroBasedElementNumber( + RT_API_ATTRS std::size_t ZeroBasedElementNumber( const SubscriptValue *, const int *permutation = nullptr) const; - DescriptorAddendum *Addendum() { + RT_API_ATTRS DescriptorAddendum *Addendum() { if (raw_.f18Addendum != 0) { return reinterpret_cast(&GetDimension(rank())); } else { return nullptr; } } - const DescriptorAddendum *Addendum() const { + const RT_API_ATTRS DescriptorAddendum *Addendum() const { if (raw_.f18Addendum != 0) { return reinterpret_cast( &GetDimension(rank())); @@ -321,7 +342,7 @@ } // Returns size in bytes of the descriptor (not the data) - static constexpr std::size_t SizeInBytes( + static constexpr RT_API_ATTRS std::size_t SizeInBytes( int rank, bool addendum = false, int lengthTypeParameters = 0) { std::size_t bytes{sizeof(Descriptor) - sizeof(Dimension)}; bytes += rank * sizeof(Dimension); @@ -331,26 +352,26 @@ return bytes; } - std::size_t SizeInBytes() const; + RT_API_ATTRS std::size_t SizeInBytes() const; - std::size_t Elements() const; + RT_API_ATTRS std::size_t Elements() const; // Allocate() assumes Elements() and ElementBytes() work; // define the extents of the dimensions and the element length // before calling. It (re)computes the byte strides after // allocation. Does not allocate automatic components or // perform default component initialization. - int Allocate(); + RT_API_ATTRS int Allocate(); // Deallocates storage; does not call FINAL subroutines or // deallocate allocatable/automatic components. - int Deallocate(); + RT_API_ATTRS int Deallocate(); // Deallocates storage, including allocatable and automatic // components. Optionally invokes FINAL subroutines. - int Destroy(bool finalize = false, bool destroyPointers = false); + RT_API_ATTRS int Destroy(bool finalize = false, bool destroyPointers = false); - bool IsContiguous(int leadingDimensions = maxRank) const { + RT_API_ATTRS bool IsContiguous(int leadingDimensions = maxRank) const { auto bytes{static_cast(ElementBytes())}; if (leadingDimensions > raw_.rank) { leadingDimensions = raw_.rank; @@ -366,12 +387,12 @@ } // Establishes a pointer to a section or element. - bool EstablishPointerSection(const Descriptor &source, + RT_API_ATTRS bool EstablishPointerSection(const Descriptor &source, const SubscriptValue *lower = nullptr, const SubscriptValue *upper = nullptr, const SubscriptValue *stride = nullptr); - void Check() const; + RT_API_ATTRS void Check() const; void Dump(FILE * = stdout) const; @@ -398,12 +419,14 @@ static constexpr std::size_t byteSize{ Descriptor::SizeInBytes(maxRank, hasAddendum, maxLengthTypeParameters)}; - Descriptor &descriptor() { return *reinterpret_cast(storage_); } - const Descriptor &descriptor() const { + RT_API_ATTRS Descriptor &descriptor() { + return *reinterpret_cast(storage_); + } + const RT_API_ATTRS Descriptor &descriptor() const { return *reinterpret_cast(storage_); } - void Check() { + RT_API_ATTRS void Check() { assert(descriptor().rank() <= maxRank); assert(descriptor().SizeInBytes() <= byteSize); if (DescriptorAddendum * addendum{descriptor().Addendum()}) { Index: flang/include/flang/Runtime/entry-names.h =================================================================== --- flang/include/flang/Runtime/entry-names.h +++ flang/include/flang/Runtime/entry-names.h @@ -16,14 +16,29 @@ * The value of REVISION should not be changed until/unless the API to the * runtime library must change in some way that breaks backward compatibility. */ +#ifndef FORTRAN_RUNTIME_ENTRY_NAMES_H +#define FORTRAN_RUNTIME_ENTRY_NAMES_H + +#include "flang/Runtime/api-attrs.h" + #ifndef RTNAME #define NAME_WITH_PREFIX_AND_REVISION(prefix, revision, name) \ prefix##revision##name #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name) #endif +#ifndef RTDECL +#define RTDECL(name) RT_API_ATTRS RTNAME(name) +#endif + +#ifndef RTDEF +#define RTDEF(name) RT_API_ATTRS RTNAME(name) +#endif + #ifndef RTNAME_STRING #define RTNAME_STRINGIFY_(x) #x #define RTNAME_STRINGIFY(x) RTNAME_STRINGIFY_(x) #define RTNAME_STRING(name) RTNAME_STRINGIFY(RTNAME(name)) #endif + +#endif /* !FORTRAN_RUNTIME_ENTRY_NAMES_H */ Index: flang/include/flang/Runtime/float128.h =================================================================== --- flang/include/flang/Runtime/float128.h +++ flang/include/flang/Runtime/float128.h @@ -33,7 +33,7 @@ #undef HAS_FLOAT128 #if (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \ - !defined(_LIBCPP_VERSION) + !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) /* * It may still be worth checking for compiler versions, * since earlier versions may define the macros above, but @@ -47,6 +47,6 @@ #define HAS_FLOAT128 1 #endif #endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \ - !defined(_LIBCPP_VERSION) */ + !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) */ #endif /* FORTRAN_RUNTIME_FLOAT128_H_ */ Index: flang/include/flang/Runtime/transformational.h =================================================================== --- flang/include/flang/Runtime/transformational.h +++ flang/include/flang/Runtime/transformational.h @@ -28,127 +28,127 @@ extern "C" { -void RTNAME(Reshape)(Descriptor &result, const Descriptor &source, +void RTDECL(Reshape)(Descriptor &result, const Descriptor &source, const Descriptor &shape, const Descriptor *pad = nullptr, const Descriptor *order = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, +void RTDECL(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn2, double bn2_1, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, long double x, long double bn2, long double bn2_1, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselJn_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJn_16)(Descriptor &result, int32_t n1, int32_t n2, CppFloat128Type x, CppFloat128Type bn2, CppFloat128Type bn2_1, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselJnX0_2)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_2)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_3)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_3)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselJnX0_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_16)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselYn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, +void RTDECL(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn1, double bn1_1, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, long double x, long double bn1, long double bn1_1, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselYn_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYn_16)(Descriptor &result, int32_t n1, int32_t n2, CppFloat128Type x, CppFloat128Type bn1, CppFloat128Type bn1_1, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselYnX0_2)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_2)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_3)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_3)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselYnX0_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_16)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(Cshift)(Descriptor &result, const Descriptor &source, +void RTDECL(Cshift)(Descriptor &result, const Descriptor &source, const Descriptor &shift, int dim = 1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(CshiftVector)(Descriptor &result, const Descriptor &source, +void RTDECL(CshiftVector)(Descriptor &result, const Descriptor &source, std::int64_t shift, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Eoshift)(Descriptor &result, const Descriptor &source, +void RTDECL(Eoshift)(Descriptor &result, const Descriptor &source, const Descriptor &shift, const Descriptor *boundary = nullptr, int dim = 1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(EoshiftVector)(Descriptor &result, const Descriptor &source, +void RTDECL(EoshiftVector)(Descriptor &result, const Descriptor &source, std::int64_t shift, const Descriptor *boundary = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Pack)(Descriptor &result, const Descriptor &source, +void RTDECL(Pack)(Descriptor &result, const Descriptor &source, const Descriptor &mask, const Descriptor *vector = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Spread)(Descriptor &result, const Descriptor &source, int dim, +void RTDECL(Spread)(Descriptor &result, const Descriptor &source, int dim, std::int64_t ncopies, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Transpose)(Descriptor &result, const Descriptor &matrix, +void RTDECL(Transpose)(Descriptor &result, const Descriptor &matrix, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Unpack)(Descriptor &result, const Descriptor &vector, +void RTDECL(Unpack)(Descriptor &result, const Descriptor &vector, const Descriptor &mask, const Descriptor &field, const char *sourceFile = nullptr, int line = 0); Index: flang/include/flang/Runtime/type-code.h =================================================================== --- flang/include/flang/Runtime/type-code.h +++ flang/include/flang/Runtime/type-code.h @@ -21,10 +21,10 @@ class TypeCode { public: TypeCode() {} - explicit TypeCode(ISO::CFI_type_t t) : raw_{t} {} - TypeCode(TypeCategory, int kind); + explicit RT_API_ATTRS TypeCode(ISO::CFI_type_t t) : raw_{t} {} + RT_API_ATTRS TypeCode(TypeCategory, int kind); - int raw() const { return raw_; } + RT_API_ATTRS int raw() const { return raw_; } constexpr bool IsValid() const { return raw_ >= CFI_type_signed_char && raw_ <= CFI_TYPE_LAST; @@ -50,9 +50,12 @@ constexpr bool IsDerived() const { return raw_ == CFI_type_struct; } constexpr bool IsIntrinsic() const { return IsValid() && !IsDerived(); } - std::optional> GetCategoryAndKind() const; + RT_API_ATTRS std::optional> + GetCategoryAndKind() const; - bool operator==(const TypeCode &that) const { return raw_ == that.raw_; } + RT_API_ATTRS bool operator==(const TypeCode &that) const { + return raw_ == that.raw_; + } bool operator!=(const TypeCode &that) const { return raw_ != that.raw_; } private: Index: flang/runtime/CMakeLists.txt =================================================================== --- flang/runtime/CMakeLists.txt +++ flang/runtime/CMakeLists.txt @@ -84,7 +84,7 @@ add_subdirectory(FortranMain) -add_flang_library(FortranRuntime +set(sources ISO_Fortran_binding.cpp allocatable.cpp array-constructor.cpp @@ -142,7 +142,112 @@ unit.cpp unit-map.cpp utf.cpp +) + +option(FLANG_EXPERIMENTAL_CUDA_RUNTIME + "Compile Fortran runtime as CUDA sources (experimental)" OFF + ) +# List of files that are buildable for all devices. +set(supported_files + transformational.cpp + ) + +if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) + enable_language(CUDA) + + # Add the unsupported files to LLVM_OPTIONAL_SOURCES. + set(todo_files ${sources}) + list(REMOVE_ITEM todo_files ${supported_files}) + list(APPEND LLVM_OPTIONAL_SOURCES ${todo_files}) + + # TODO: figure out how to make target property CUDA_SEPARABLE_COMPILATION + # work, and avoid setting CMAKE_CUDA_SEPARABLE_COMPILATION. + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + + # Treat all sources as CUDA files. + set(sources ${supported_files}) + set_source_files_properties(${sources} PROPERTIES LANGUAGE CUDA) + if ("${CMAKE_CUDA_COMPILER_ID}" MATCHES "Clang") + # Allow varargs. + add_compile_options(-Xclang -fcuda-allow-variadic-functions) + endif() +endif() + +set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING + "Compile Fortran runtime as OpenMP target offload sources (experimental). Valid options are 'off', 'host_device', 'nohost'") + +set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING + "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')") + +if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "off") + # 'host_device' build only works with Clang compiler currently. + # The build is done with the CMAKE_C/CXX_COMPILER, i.e. it does not use + # the in-tree built Clang. We may have a mode that would use the in-tree + # built Clang. + # + # 'nohost' is supposed to produce an LLVM Bitcode library, + # and it has to be done with a C/C++ compiler producing LLVM Bitcode + # compatible with the LLVM toolchain version distributed with the Flang + # compiler. + # In general, the in-tree built Clang should be used for 'nohost' build. + # Note that 'nohost' build does not produce the host version of Flang + # runtime library, so there will be two separate distributable objects. + # 'nohost' build is a TODO. + + if (NOT FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD STREQUAL "host_device") + message(FATAL_ERROR "Unsupported OpenMP offload build of Flang runtime") + endif() + + # Add the unsupported files to LLVM_OPTIONAL_SOURCES. + set(todo_files ${sources}) + list(REMOVE_ITEM todo_files ${supported_files}) + list(APPEND LLVM_OPTIONAL_SOURCES ${todo_files}) + set(sources ${supported_files}) + + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang" AND + "${CMAKE_C_COMPILER_ID}" MATCHES "Clang") + + set(all_amdgpu_architectures + "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" + "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" + "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" + "gfx1100;gfx1101;gfx1102;gfx1103" + ) + set(all_nvptx_architectures + "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" + "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90" + ) + set(all_gpu_architectures + "${all_amdgpu_architectures};${all_nvptx_architectures}" + ) + # TODO: support auto detection on the build system. + if (FLANG_OMP_DEVICE_ARCHITECTURES STREQUAL "all") + set(FLANG_OMP_DEVICE_ARCHITECTURES ${all_gpu_architectures}) + endif() + list(REMOVE_DUPLICATES FLANG_OMP_DEVICE_ARCHITECTURES) + + string(REPLACE ";" "," compile_for_architectures + "${FLANG_OMP_DEVICE_ARCHITECTURES}" + ) + + add_compile_options(-fopenmp -fvisibility=hidden -fopenmp-cuda-mode) + add_compile_options(--offload-arch=${compile_for_architectures}) + # Force LTO for the device part. + add_compile_options(-foffload-lto) + else() + message(FATAL_ERROR + "Flang runtime build is not supported for these compilers:\n" + "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}\n" + "CMAKE_C_COMPILER_ID: ${CMAKE_C_COMPILER_ID}") + endif() + + # Enable "declare target" in the source code. + add_compile_definitions(OMP_OFFLOAD_BUILD) +endif() + +add_flang_library(FortranRuntime + ${sources} LINK_LIBS FortranDecimal Index: flang/runtime/copy.h =================================================================== --- flang/runtime/copy.h +++ flang/runtime/copy.h @@ -18,11 +18,12 @@ // Assigns to uninitialized storage. // Duplicates allocatable & automatic components. -void CopyElement(const Descriptor &to, const SubscriptValue toAt[], +RT_API_ATTRS void CopyElement(const Descriptor &to, const SubscriptValue toAt[], const Descriptor &from, const SubscriptValue fromAt[], Terminator &); // Copies data from one allocated descriptor's array to another. -void CopyArray(const Descriptor &to, const Descriptor &from, Terminator &); +RT_API_ATTRS void CopyArray( + const Descriptor &to, const Descriptor &from, Terminator &); } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_COPY_H_ Index: flang/runtime/freestanding-tools.h =================================================================== --- /dev/null +++ flang/runtime/freestanding-tools.h @@ -0,0 +1,43 @@ +//===-- runtime/freestanding-tools.h ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_ +#define FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_ + +#include "flang/Runtime/api-attrs.h" +#include + +// The file defines a set of utilities/classes that might be +// used to get reduce the dependency on external libraries (e.g. libstdc++). + +#if !defined(STD_FILL_N_UNSUPPORTED) && \ + (defined(__CUDACC__) || defined(__CUDA__)) && defined(__CUDA_ARCH__) +#define STD_FILL_N_UNSUPPORTED 1 +#endif + +namespace Fortran::runtime { + +#if STD_FILL_N_UNSUPPORTED +// Provides alternative implementation for std::fill_n(), if +// it is not supported. +template +static inline RT_API_ATTRS void fill_n( + A *start, std::size_t count, const A &value) { +#if STD_FILL_N_UNSUPPORTED + for (std::size_t j{0}; j < count; ++j) + start[j] = value; +#else + std::fill_n(start, count, value); +#endif +} +#else // !STD_FILL_N_UNSUPPORTED +using std::fill_n; +#endif // !STD_FILL_N_UNSUPPORTED + +} // namespace Fortran::runtime +#endif // FORTRAN_RUNTIME_FREESTANDING_TOOLS_H_ Index: flang/runtime/terminator.h =================================================================== --- flang/runtime/terminator.h +++ flang/runtime/terminator.h @@ -11,6 +11,7 @@ #ifndef FORTRAN_RUNTIME_TERMINATOR_H_ #define FORTRAN_RUNTIME_TERMINATOR_H_ +#include "flang/Runtime/api-attrs.h" #include namespace Fortran::runtime { @@ -21,7 +22,8 @@ public: Terminator() {} Terminator(const Terminator &) = default; - explicit Terminator(const char *sourceFileName, int sourceLine = 0) + explicit RT_API_ATTRS Terminator( + const char *sourceFileName, int sourceLine = 0) : sourceFileName_{sourceFileName}, sourceLine_{sourceLine} {} const char *sourceFileName() const { return sourceFileName_; } @@ -31,11 +33,16 @@ sourceFileName_ = sourceFileName; sourceLine_ = sourceLine; } - [[noreturn]] void Crash(const char *message, ...) const; - [[noreturn]] void CrashArgs(const char *message, va_list &) const; - [[noreturn]] void CheckFailed( + + // CUDA_TODO: Clang for CUDA does not support varargs, though + // it compiles it with -fcuda-allow-variadic-functions. + // We can try to replace varargs functions with variadic templates. + [[noreturn]] RT_API_ATTRS void Crash(const char *message, ...) const; + [[noreturn]] RT_API_ATTRS void CrashArgs( + const char *message, va_list &) const; + [[noreturn]] RT_API_ATTRS void CheckFailed( const char *predicate, const char *file, int line) const; - [[noreturn]] void CheckFailed(const char *predicate) const; + [[noreturn]] RT_API_ATTRS void CheckFailed(const char *predicate) const; // For test harnessing - overrides CrashArgs(). static void RegisterCrashHandler(void (*)(const char *sourceFile, Index: flang/runtime/tools.h =================================================================== --- flang/runtime/tools.h +++ flang/runtime/tools.h @@ -9,6 +9,7 @@ #ifndef FORTRAN_RUNTIME_TOOLS_H_ #define FORTRAN_RUNTIME_TOOLS_H_ +#include "freestanding-tools.h" #include "terminator.h" #include "flang/Runtime/cpp-type.h" #include "flang/Runtime/descriptor.h" @@ -39,7 +40,7 @@ char *to, std::size_t toLength, const char *from); // Utility for dealing with elemental LOGICAL arguments -inline bool IsLogicalElementTrue( +inline RT_API_ATTRS bool IsLogicalElementTrue( const Descriptor &logical, const SubscriptValue at[]) { // A LOGICAL value is false if and only if all of its bytes are zero. const char *p{logical.Element(at)}; @@ -52,7 +53,7 @@ } // Check array conformability; a scalar 'x' conforms. Crashes on error. -void CheckConformability(const Descriptor &to, const Descriptor &x, +RT_API_ATTRS void CheckConformability(const Descriptor &to, const Descriptor &x, Terminator &, const char *funcName, const char *toName, const char *fromName); @@ -66,7 +67,8 @@ }; // Validate a KIND= argument -void CheckIntegerKind(Terminator &, int kind, const char *intrinsic); +RT_API_ATTRS void CheckIntegerKind( + Terminator &, int kind, const char *intrinsic); template inline void PutContiguousConverted(TO *to, FROM *from, std::size_t count) { @@ -75,7 +77,7 @@ } } -static inline std::int64_t GetInt64( +static inline RT_API_ATTRS std::int64_t GetInt64( const char *p, std::size_t bytes, Terminator &terminator) { switch (bytes) { case 1: @@ -116,7 +118,7 @@ // arguments. template