Index: flang/include/flang/ISO_Fortran_binding.h =================================================================== --- flang/include/flang/ISO_Fortran_binding.h +++ flang/include/flang/ISO_Fortran_binding.h @@ -18,6 +18,8 @@ * implementation. */ +#include "Runtime/device.h" + #ifdef __cplusplus namespace Fortran { namespace ISO { @@ -121,8 +123,10 @@ // care of getting the memory storage. Note that it already contains one element // because a struct cannot be empty. template struct FlexibleArray : T { - T &operator[](int index) { return *(this + index); } - const T &operator[](int index) const { return *(this + index); } + HOST_DEVICE_ATTRS T &operator[](int index) { return *(this + index); } + const HOST_DEVICE_ATTRS T &operator[](int index) const { + return *(this + index); + } operator T *() { return this; } operator const T *() const { return this; } }; @@ -174,11 +178,11 @@ void *CFI_address(const CFI_cdesc_t *, const CFI_index_t subscripts[]); int CFI_allocate(CFI_cdesc_t *, const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[], size_t elem_len); -int CFI_deallocate(CFI_cdesc_t *); +HOST_DEVICE_ATTRS int CFI_deallocate(CFI_cdesc_t *); int CFI_establish(CFI_cdesc_t *, void *base_addr, CFI_attribute_t, CFI_type_t, size_t elem_len, CFI_rank_t, const CFI_index_t extents[]); int CFI_is_contiguous(const CFI_cdesc_t *); -int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source, +HOST_DEVICE_ATTRS int CFI_section(CFI_cdesc_t *, const CFI_cdesc_t *source, const CFI_index_t lower_bounds[], const CFI_index_t upper_bounds[], const CFI_index_t strides[]); int CFI_select_part(CFI_cdesc_t *, const CFI_cdesc_t *source, Index: flang/include/flang/Runtime/descriptor.h =================================================================== --- flang/include/flang/Runtime/descriptor.h +++ flang/include/flang/Runtime/descriptor.h @@ -18,6 +18,8 @@ // User C code is welcome to depend on that ISO_Fortran_binding.h file, // but should never reference this internal header. +#include "flang/Runtime/device.h" + #include "flang/ISO_Fortran_binding.h" #include "flang/Runtime/memory.h" #include "flang/Runtime/type-code.h" @@ -44,12 +46,17 @@ class Dimension { public: - SubscriptValue LowerBound() const { return raw_.lower_bound; } - SubscriptValue Extent() const { return raw_.extent; } - SubscriptValue UpperBound() const { return LowerBound() + Extent() - 1; } - SubscriptValue ByteStride() const { return raw_.sm; } + HOST_DEVICE_ATTRS SubscriptValue LowerBound() const { + return raw_.lower_bound; + } + HOST_DEVICE_ATTRS SubscriptValue Extent() const { return raw_.extent; } + HOST_DEVICE_ATTRS SubscriptValue UpperBound() const { + return LowerBound() + Extent() - 1; + } + HOST_DEVICE_ATTRS SubscriptValue ByteStride() const { return raw_.sm; } - Dimension &SetBounds(SubscriptValue lower, SubscriptValue upper) { + HOST_DEVICE_ATTRS Dimension &SetBounds( + SubscriptValue lower, SubscriptValue upper) { if (upper >= lower) { raw_.lower_bound = lower; raw_.extent = upper - lower + 1; @@ -74,7 +81,7 @@ raw_.extent = extent; return *this; } - Dimension &SetByteStride(SubscriptValue bytes) { + HOST_DEVICE_ATTRS Dimension &SetByteStride(SubscriptValue bytes) { raw_.sm = bytes; return *this; } @@ -91,29 +98,36 @@ // array is determined by derivedType_->LenParameters(). class DescriptorAddendum { public: - explicit DescriptorAddendum(const typeInfo::DerivedType *dt = nullptr) + explicit HOST_DEVICE_ATTRS DescriptorAddendum( + const typeInfo::DerivedType *dt = nullptr) : derivedType_{dt} {} - DescriptorAddendum &operator=(const DescriptorAddendum &); + HOST_DEVICE_ATTRS DescriptorAddendum &operator=(const DescriptorAddendum &); - const typeInfo::DerivedType *derivedType() const { return derivedType_; } - DescriptorAddendum &set_derivedType(const typeInfo::DerivedType *dt) { + const HOST_DEVICE_ATTRS typeInfo::DerivedType *derivedType() const { + return derivedType_; + } + HOST_DEVICE_ATTRS DescriptorAddendum &set_derivedType( + const typeInfo::DerivedType *dt) { derivedType_ = dt; return *this; } - std::size_t LenParameters() const; + HOST_DEVICE_ATTRS std::size_t LenParameters() const; - typeInfo::TypeParameterValue LenParameterValue(int which) const { + HOST_DEVICE_ATTRS typeInfo::TypeParameterValue LenParameterValue( + int which) const { return len_[which]; } - static constexpr std::size_t SizeInBytes(int lenParameters) { + static constexpr HOST_DEVICE_ATTRS std::size_t SizeInBytes( + int lenParameters) { // TODO: Don't waste that last word if lenParameters == 0 return sizeof(DescriptorAddendum) + std::max(lenParameters - 1, 0) * sizeof(typeInfo::TypeParameterValue); } - std::size_t SizeInBytes() const; + HOST_DEVICE_ATTRS std::size_t SizeInBytes() const; - void SetLenParameterValue(int which, typeInfo::TypeParameterValue x) { + HOST_DEVICE_ATTRS void SetLenParameterValue( + int which, typeInfo::TypeParameterValue x) { len_[which] = x; } @@ -142,30 +156,35 @@ // Create() static member functions otherwise to dynamically allocate a // descriptor. - Descriptor(const Descriptor &); - Descriptor &operator=(const Descriptor &); + HOST_DEVICE_ATTRS Descriptor(const Descriptor &); + HOST_DEVICE_ATTRS Descriptor &operator=(const Descriptor &); // Returns the number of bytes occupied by an element of the given // category and kind including any alignment padding required // between adjacent elements. - static std::size_t BytesFor(TypeCategory category, int kind); + static HOST_DEVICE_ATTRS std::size_t BytesFor( + TypeCategory category, int kind); - void Establish(TypeCode t, std::size_t elementBytes, void *p = nullptr, - int rank = maxRank, const SubscriptValue *extent = nullptr, - ISO::CFI_attribute_t attribute = CFI_attribute_other, - bool addendum = false); - void Establish(TypeCategory, int kind, void *p = nullptr, int rank = maxRank, + HOST_DEVICE_ATTRS void Establish(TypeCode t, std::size_t elementBytes, + void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other, bool addendum = false); - void Establish(int characterKind, std::size_t characters, void *p = nullptr, + HOST_DEVICE_ATTRS void Establish(TypeCategory, int kind, void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other, bool addendum = false); - void Establish(const typeInfo::DerivedType &dt, void *p = nullptr, - int rank = maxRank, const SubscriptValue *extent = nullptr, + HOST_DEVICE_ATTRS void Establish(int characterKind, std::size_t characters, + void *p = nullptr, int rank = maxRank, + const SubscriptValue *extent = nullptr, + ISO::CFI_attribute_t attribute = CFI_attribute_other, + bool addendum = false); + HOST_DEVICE_ATTRS void Establish(const typeInfo::DerivedType &dt, + void *p = nullptr, int rank = maxRank, + const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other); + // CUDA_TODO: Clang does not support unique_ptr on device. static OwningPtr Create(TypeCode t, std::size_t elementBytes, void *p = nullptr, int rank = maxRank, const SubscriptValue *extent = nullptr, @@ -183,37 +202,42 @@ const SubscriptValue *extent = nullptr, ISO::CFI_attribute_t attribute = CFI_attribute_other); - ISO::CFI_cdesc_t &raw() { return raw_; } - const ISO::CFI_cdesc_t &raw() const { return raw_; } - std::size_t ElementBytes() const { return raw_.elem_len; } - int rank() const { return raw_.rank; } - TypeCode type() const { return TypeCode{raw_.type}; } + HOST_DEVICE_ATTRS ISO::CFI_cdesc_t &raw() { return raw_; } + const HOST_DEVICE_ATTRS ISO::CFI_cdesc_t &raw() const { return raw_; } + HOST_DEVICE_ATTRS std::size_t ElementBytes() const { return raw_.elem_len; } + HOST_DEVICE_ATTRS int rank() const { return raw_.rank; } + HOST_DEVICE_ATTRS TypeCode type() const { return TypeCode{raw_.type}; } - Descriptor &set_base_addr(void *p) { + HOST_DEVICE_ATTRS Descriptor &set_base_addr(void *p) { raw_.base_addr = p; return *this; } - bool IsPointer() const { return raw_.attribute == CFI_attribute_pointer; } - bool IsAllocatable() const { + HOST_DEVICE_ATTRS bool IsPointer() const { + return raw_.attribute == CFI_attribute_pointer; + } + HOST_DEVICE_ATTRS bool IsAllocatable() const { return raw_.attribute == CFI_attribute_allocatable; } - bool IsAllocated() const { return raw_.base_addr != nullptr; } + HOST_DEVICE_ATTRS bool IsAllocated() const { + return raw_.base_addr != nullptr; + } - Dimension &GetDimension(int dim) { + HOST_DEVICE_ATTRS Dimension &GetDimension(int dim) { return *reinterpret_cast(&raw_.dim[dim]); } - const Dimension &GetDimension(int dim) const { + const HOST_DEVICE_ATTRS Dimension &GetDimension(int dim) const { return *reinterpret_cast(&raw_.dim[dim]); } - std::size_t SubscriptByteOffset( + HOST_DEVICE_ATTRS std::size_t SubscriptByteOffset( int dim, SubscriptValue subscriptValue) const { const Dimension &dimension{GetDimension(dim)}; return (subscriptValue - dimension.LowerBound()) * dimension.ByteStride(); } - std::size_t SubscriptsToByteOffset(const SubscriptValue subscript[]) const { + HOST_DEVICE_ATTRS std::size_t SubscriptsToByteOffset( + const SubscriptValue subscript[]) const { std::size_t offset{0}; for (int j{0}; j < raw_.rank; ++j) { offset += SubscriptByteOffset(j, subscript[j]); @@ -221,16 +245,19 @@ return offset; } - template A *OffsetElement(std::size_t offset = 0) const { + template + HOST_DEVICE_ATTRS A *OffsetElement(std::size_t offset = 0) const { return reinterpret_cast( reinterpret_cast(raw_.base_addr) + offset); } - template A *Element(const SubscriptValue subscript[]) const { + template + HOST_DEVICE_ATTRS A *Element(const SubscriptValue subscript[]) const { return OffsetElement(SubscriptsToByteOffset(subscript)); } - template A *ZeroBasedIndexedElement(std::size_t n) const { + template + HOST_DEVICE_ATTRS A *ZeroBasedIndexedElement(std::size_t n) const { SubscriptValue at[maxRank]; if (SubscriptsForZeroBasedElementNumber(at, n)) { return Element(at); @@ -238,14 +265,14 @@ return nullptr; } - int GetLowerBounds(SubscriptValue subscript[]) const { + HOST_DEVICE_ATTRS int GetLowerBounds(SubscriptValue subscript[]) const { for (int j{0}; j < raw_.rank; ++j) { subscript[j] = GetDimension(j).LowerBound(); } return raw_.rank; } - int GetShape(SubscriptValue subscript[]) const { + HOST_DEVICE_ATTRS int GetShape(SubscriptValue subscript[]) const { for (int j{0}; j < raw_.rank; ++j) { subscript[j] = GetDimension(j).Extent(); } @@ -255,7 +282,7 @@ // When the passed subscript vector contains the last (or first) // subscripts of the array, these wrap the subscripts around to // their first (or last) values and return false. - bool IncrementSubscripts( + HOST_DEVICE_ATTRS bool IncrementSubscripts( SubscriptValue subscript[], const int *permutation = nullptr) const { for (int j{0}; j < raw_.rank; ++j) { int k{permutation ? permutation[j] : j}; @@ -268,12 +295,13 @@ return false; } - bool DecrementSubscripts( + HOST_DEVICE_ATTRS bool DecrementSubscripts( SubscriptValue[], const int *permutation = nullptr) const; // False when out of range. - bool SubscriptsForZeroBasedElementNumber(SubscriptValue subscript[], - std::size_t elementNumber, const int *permutation = nullptr) const { + HOST_DEVICE_ATTRS bool SubscriptsForZeroBasedElementNumber( + SubscriptValue subscript[], std::size_t elementNumber, + const int *permutation = nullptr) const { if (raw_.rank == 0) { return elementNumber == 0; } @@ -301,17 +329,17 @@ return true; } - std::size_t ZeroBasedElementNumber( + HOST_DEVICE_ATTRS std::size_t ZeroBasedElementNumber( const SubscriptValue *, const int *permutation = nullptr) const; - DescriptorAddendum *Addendum() { + HOST_DEVICE_ATTRS DescriptorAddendum *Addendum() { if (raw_.f18Addendum != 0) { return reinterpret_cast(&GetDimension(rank())); } else { return nullptr; } } - const DescriptorAddendum *Addendum() const { + const HOST_DEVICE_ATTRS DescriptorAddendum *Addendum() const { if (raw_.f18Addendum != 0) { return reinterpret_cast( &GetDimension(rank())); @@ -321,7 +349,7 @@ } // Returns size in bytes of the descriptor (not the data) - static constexpr std::size_t SizeInBytes( + static constexpr HOST_DEVICE_ATTRS std::size_t SizeInBytes( int rank, bool addendum = false, int lengthTypeParameters = 0) { std::size_t bytes{sizeof(Descriptor) - sizeof(Dimension)}; bytes += rank * sizeof(Dimension); @@ -331,26 +359,27 @@ return bytes; } - std::size_t SizeInBytes() const; + HOST_DEVICE_ATTRS std::size_t SizeInBytes() const; - std::size_t Elements() const; + HOST_DEVICE_ATTRS std::size_t Elements() const; // Allocate() assumes Elements() and ElementBytes() work; // define the extents of the dimensions and the element length // before calling. It (re)computes the byte strides after // allocation. Does not allocate automatic components or // perform default component initialization. - int Allocate(); + HOST_DEVICE_ATTRS int Allocate(); // Deallocates storage; does not call FINAL subroutines or // deallocate allocatable/automatic components. - int Deallocate(); + HOST_DEVICE_ATTRS int Deallocate(); // Deallocates storage, including allocatable and automatic // components. Optionally invokes FINAL subroutines. - int Destroy(bool finalize = false, bool destroyPointers = false); + HOST_DEVICE_ATTRS int Destroy( + bool finalize = false, bool destroyPointers = false); - bool IsContiguous(int leadingDimensions = maxRank) const { + HOST_DEVICE_ATTRS bool IsContiguous(int leadingDimensions = maxRank) const { auto bytes{static_cast(ElementBytes())}; if (leadingDimensions > raw_.rank) { leadingDimensions = raw_.rank; @@ -366,12 +395,12 @@ } // Establishes a pointer to a section or element. - bool EstablishPointerSection(const Descriptor &source, + HOST_DEVICE_ATTRS bool EstablishPointerSection(const Descriptor &source, const SubscriptValue *lower = nullptr, const SubscriptValue *upper = nullptr, const SubscriptValue *stride = nullptr); - void Check() const; + HOST_DEVICE_ATTRS void Check() const; void Dump(FILE * = stdout) const; @@ -398,12 +427,14 @@ static constexpr std::size_t byteSize{ Descriptor::SizeInBytes(maxRank, hasAddendum, maxLengthTypeParameters)}; - Descriptor &descriptor() { return *reinterpret_cast(storage_); } - const Descriptor &descriptor() const { + HOST_DEVICE_ATTRS Descriptor &descriptor() { + return *reinterpret_cast(storage_); + } + const HOST_DEVICE_ATTRS Descriptor &descriptor() const { return *reinterpret_cast(storage_); } - void Check() { + HOST_DEVICE_ATTRS void Check() { assert(descriptor().rank() <= maxRank); assert(descriptor().SizeInBytes() <= byteSize); if (DescriptorAddendum * addendum{descriptor().Addendum()}) { Index: flang/include/flang/Runtime/device.h =================================================================== --- /dev/null +++ flang/include/flang/Runtime/device.h @@ -0,0 +1,24 @@ +/*===-- include/flang/Runtime/device.h ------------------------------*- C -*-=// + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef FORTRAN_RUNTIME_DEVICE_H +#define FORTRAN_RUNTIME_DEVICE_H + +#ifdef __CUDA__ +#define HOST_DEVICE_ATTRS __host__ __device__ + +#ifdef __CUDA_ARCH__ +#define CUDA_DEVICE 1 +#endif + +#else +#define HOST_DEVICE_ATTRS +#endif + +#endif /* !FORTRAN_RUNTIME_DEVICE_H */ Index: flang/include/flang/Runtime/entry-names.h =================================================================== --- flang/include/flang/Runtime/entry-names.h +++ flang/include/flang/Runtime/entry-names.h @@ -16,14 +16,25 @@ * The value of REVISION should not be changed until/unless the API to the * runtime library must change in some way that breaks backward compatibility. */ +#ifndef FORTRAN_RUNTIME_ENTRY_NAMES_H +#define FORTRAN_RUNTIME_ENTRY_NAMES_H + +#include "flang/Runtime/device.h" + #ifndef RTNAME #define NAME_WITH_PREFIX_AND_REVISION(prefix, revision, name) \ prefix##revision##name #define RTNAME(name) NAME_WITH_PREFIX_AND_REVISION(_Fortran, A, name) #endif +#ifndef RTDECL +#define RTDECL(name) HOST_DEVICE_ATTRS RTNAME(name) +#endif + #ifndef RTNAME_STRING #define RTNAME_STRINGIFY_(x) #x #define RTNAME_STRINGIFY(x) RTNAME_STRINGIFY_(x) #define RTNAME_STRING(name) RTNAME_STRINGIFY(RTNAME(name)) #endif + +#endif /* !FORTRAN_RUNTIME_ENTRY_NAMES_H */ Index: flang/include/flang/Runtime/float128.h =================================================================== --- flang/include/flang/Runtime/float128.h +++ flang/include/flang/Runtime/float128.h @@ -33,7 +33,7 @@ #undef HAS_FLOAT128 #if (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \ - !defined(_LIBCPP_VERSION) + !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) /* * It may still be worth checking for compiler versions, * since earlier versions may define the macros above, but @@ -47,6 +47,6 @@ #define HAS_FLOAT128 1 #endif #endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \ - !defined(_LIBCPP_VERSION) */ + !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) */ #endif /* FORTRAN_RUNTIME_FLOAT128_H_ */ Index: flang/include/flang/Runtime/transformational.h =================================================================== --- flang/include/flang/Runtime/transformational.h +++ flang/include/flang/Runtime/transformational.h @@ -28,127 +28,127 @@ extern "C" { -void RTNAME(Reshape)(Descriptor &result, const Descriptor &source, +void RTDECL(Reshape)(Descriptor &result, const Descriptor &source, const Descriptor &shape, const Descriptor *pad = nullptr, const Descriptor *order = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselJn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn2, float bn2_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, +void RTDECL(BesselJn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn2, double bn2_1, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJn_10)(Descriptor &result, int32_t n1, int32_t n2, long double x, long double bn2, long double bn2_1, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselJn_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJn_16)(Descriptor &result, int32_t n1, int32_t n2, CppFloat128Type x, CppFloat128Type bn2, CppFloat128Type bn2_1, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselJnX0_2)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_2)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_3)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_3)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_4)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselJnX0_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselJnX0_16)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselYn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_2)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_3)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, +void RTDECL(BesselYn_4)(Descriptor &result, int32_t n1, int32_t n2, float x, float bn1, float bn1_1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, +void RTDECL(BesselYn_8)(Descriptor &result, int32_t n1, int32_t n2, double x, double bn1, double bn1_1, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYn_10)(Descriptor &result, int32_t n1, int32_t n2, long double x, long double bn1, long double bn1_1, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselYn_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYn_16)(Descriptor &result, int32_t n1, int32_t n2, CppFloat128Type x, CppFloat128Type bn1, CppFloat128Type bn1_1, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(BesselYnX0_2)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_2)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_3)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_3)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_4)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); -void RTNAME(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_8)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #if LDBL_MANT_DIG == 64 -void RTNAME(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_10)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -void RTNAME(BesselYnX0_16)(Descriptor &result, int32_t n1, int32_t n2, +void RTDECL(BesselYnX0_16)(Descriptor &result, int32_t n1, int32_t n2, const char *sourceFile = nullptr, int line = 0); #endif -void RTNAME(Cshift)(Descriptor &result, const Descriptor &source, +void RTDECL(Cshift)(Descriptor &result, const Descriptor &source, const Descriptor &shift, int dim = 1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(CshiftVector)(Descriptor &result, const Descriptor &source, +void RTDECL(CshiftVector)(Descriptor &result, const Descriptor &source, std::int64_t shift, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Eoshift)(Descriptor &result, const Descriptor &source, +void RTDECL(Eoshift)(Descriptor &result, const Descriptor &source, const Descriptor &shift, const Descriptor *boundary = nullptr, int dim = 1, const char *sourceFile = nullptr, int line = 0); -void RTNAME(EoshiftVector)(Descriptor &result, const Descriptor &source, +void RTDECL(EoshiftVector)(Descriptor &result, const Descriptor &source, std::int64_t shift, const Descriptor *boundary = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Pack)(Descriptor &result, const Descriptor &source, +void RTDECL(Pack)(Descriptor &result, const Descriptor &source, const Descriptor &mask, const Descriptor *vector = nullptr, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Spread)(Descriptor &result, const Descriptor &source, int dim, +void RTDECL(Spread)(Descriptor &result, const Descriptor &source, int dim, std::int64_t ncopies, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Transpose)(Descriptor &result, const Descriptor &matrix, +void RTDECL(Transpose)(Descriptor &result, const Descriptor &matrix, const char *sourceFile = nullptr, int line = 0); -void RTNAME(Unpack)(Descriptor &result, const Descriptor &vector, +void RTDECL(Unpack)(Descriptor &result, const Descriptor &vector, const Descriptor &mask, const Descriptor &field, const char *sourceFile = nullptr, int line = 0); Index: flang/include/flang/Runtime/type-code.h =================================================================== --- flang/include/flang/Runtime/type-code.h +++ flang/include/flang/Runtime/type-code.h @@ -21,10 +21,10 @@ class TypeCode { public: TypeCode() {} - explicit TypeCode(ISO::CFI_type_t t) : raw_{t} {} - TypeCode(TypeCategory, int kind); + explicit HOST_DEVICE_ATTRS TypeCode(ISO::CFI_type_t t) : raw_{t} {} + HOST_DEVICE_ATTRS TypeCode(TypeCategory, int kind); - int raw() const { return raw_; } + HOST_DEVICE_ATTRS int raw() const { return raw_; } constexpr bool IsValid() const { return raw_ >= CFI_type_signed_char && raw_ <= CFI_TYPE_LAST; @@ -50,9 +50,12 @@ constexpr bool IsDerived() const { return raw_ == CFI_type_struct; } constexpr bool IsIntrinsic() const { return IsValid() && !IsDerived(); } - std::optional> GetCategoryAndKind() const; + HOST_DEVICE_ATTRS std::optional> + GetCategoryAndKind() const; - bool operator==(const TypeCode &that) const { return raw_ == that.raw_; } + HOST_DEVICE_ATTRS bool operator==(const TypeCode &that) const { + return raw_ == that.raw_; + } bool operator!=(const TypeCode &that) const { return raw_ != that.raw_; } private: Index: flang/runtime/CMakeLists.txt =================================================================== --- flang/runtime/CMakeLists.txt +++ flang/runtime/CMakeLists.txt @@ -16,7 +16,15 @@ "at least 3.20.0 now to avoid issues in the future!") endif() - project(FlangRuntime C CXX) + option(FLANG_EXPERIMENTAL_CUDA_RUNTIME + "Compile Fortran runtime as CUDA sources (experimental)" OFF + ) + + if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) + set(enable_cuda_lang CUDA) + endif() + + project(FlangRuntime C CXX ${enable_cuda_lang}) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) @@ -57,6 +65,11 @@ include(CheckCXXSymbolExists) include(CheckCXXSourceCompiles) +# TODO: in CUDA build mode these checks should be done +# for the device side as well, and probably instead +# of failing during CMake configuration the corresponding +# functionality has to be disabled/customized for the device +# compilation. check_cxx_symbol_exists(strerror string.h HAVE_STRERROR) check_cxx_symbol_exists(strerror_r string.h HAVE_STRERROR_R) # Can't use symbol exists here as the function is overloaded in C++ @@ -96,7 +109,7 @@ add_subdirectory(FortranMain) -add_flang_library(FortranRuntime +set(sources ISO_Fortran_binding.cpp allocatable.cpp array-constructor.cpp @@ -154,7 +167,30 @@ unit.cpp unit-map.cpp utf.cpp +) +if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) + # List of files that are buildable. + set(supported_files + transformational.cpp + ) + + # Add the unsupported files to LLVM_OPTIONAL_SOURCES. + set(todo_files ${sources}) + list(REMOVE_ITEM todo_files ${supported_files}) + list(APPEND LLVM_OPTIONAL_SOURCES ${todo_files}) + + # Treat all sources as CUDA files. + set(sources ${supported_files}) + set_source_files_properties(${sources} PROPERTIES LANGUAGE CUDA) + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + # Allow varargs. + add_compile_options(-Xclang -fcuda-allow-variadic-functions) + endif() +endif() + +add_flang_library(FortranRuntime + ${sources} LINK_LIBS FortranDecimal Index: flang/runtime/copy.h =================================================================== --- flang/runtime/copy.h +++ flang/runtime/copy.h @@ -18,11 +18,13 @@ // Assigns to uninitialized storage. // Duplicates allocatable & automatic components. -void CopyElement(const Descriptor &to, const SubscriptValue toAt[], - const Descriptor &from, const SubscriptValue fromAt[], Terminator &); +HOST_DEVICE_ATTRS void CopyElement(const Descriptor &to, + const SubscriptValue toAt[], const Descriptor &from, + const SubscriptValue fromAt[], Terminator &); // Copies data from one allocated descriptor's array to another. -void CopyArray(const Descriptor &to, const Descriptor &from, Terminator &); +HOST_DEVICE_ATTRS void CopyArray( + const Descriptor &to, const Descriptor &from, Terminator &); } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_COPY_H_ Index: flang/runtime/device-tools.h =================================================================== --- /dev/null +++ flang/runtime/device-tools.h @@ -0,0 +1,37 @@ +//===-- runtime/device-tools.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_RUNTIME_DEVICE_TOOLS_H +#define FORTRAN_RUNTIME_DEVICE_TOOLS_H + +#include "flang/Runtime/device.h" +#include + +// The files defines a set of utilities/classes that might be not +// supported on the offload devices via standard libraries (e.g. STD C++). +// In some cases the implementations are temporary until the device +// toolchains implement support on their side. + +namespace Fortran::runtime { + +// Provides alternative implementation for std::fill_n(), if +// it is not supported. +template +static inline HOST_DEVICE_ATTRS void FillN( + A *start, std::size_t count, const A &value) { +#if defined(CUDA_DEVICE) && defined(__clang__) + // Clang does not support std::fill_n as a device function. + for (std::size_t j{0}; j < count; ++j) + start[j] = value; +#else + std::fill_n(start, count, value); +#endif +} + +} // namespace Fortran::runtime +#endif // FORTRAN_RUNTIME_DEVICE_TOOLS_H Index: flang/runtime/terminator.h =================================================================== --- flang/runtime/terminator.h +++ flang/runtime/terminator.h @@ -11,6 +11,7 @@ #ifndef FORTRAN_RUNTIME_TERMINATOR_H_ #define FORTRAN_RUNTIME_TERMINATOR_H_ +#include "flang/Runtime/device.h" #include namespace Fortran::runtime { @@ -21,7 +22,8 @@ public: Terminator() {} Terminator(const Terminator &) = default; - explicit Terminator(const char *sourceFileName, int sourceLine = 0) + explicit HOST_DEVICE_ATTRS Terminator( + const char *sourceFileName, int sourceLine = 0) : sourceFileName_{sourceFileName}, sourceLine_{sourceLine} {} const char *sourceFileName() const { return sourceFileName_; } @@ -31,11 +33,16 @@ sourceFileName_ = sourceFileName; sourceLine_ = sourceLine; } - [[noreturn]] void Crash(const char *message, ...) const; - [[noreturn]] void CrashArgs(const char *message, va_list &) const; - [[noreturn]] void CheckFailed( + + // CUDA_TODO: Clang for CUDA does not support varargs, though + // it compiles it with -fcuda-allow-variadic-functions. + // We can try to replace varargs functions with variadic templates. + [[noreturn]] HOST_DEVICE_ATTRS void Crash(const char *message, ...) const; + [[noreturn]] HOST_DEVICE_ATTRS void CrashArgs( + const char *message, va_list &) const; + [[noreturn]] HOST_DEVICE_ATTRS void CheckFailed( const char *predicate, const char *file, int line) const; - [[noreturn]] void CheckFailed(const char *predicate) const; + [[noreturn]] HOST_DEVICE_ATTRS void CheckFailed(const char *predicate) const; // For test harnessing - overrides CrashArgs(). static void RegisterCrashHandler(void (*)(const char *sourceFile, Index: flang/runtime/tools.h =================================================================== --- flang/runtime/tools.h +++ flang/runtime/tools.h @@ -9,6 +9,7 @@ #ifndef FORTRAN_RUNTIME_TOOLS_H_ #define FORTRAN_RUNTIME_TOOLS_H_ +#include "device-tools.h" #include "terminator.h" #include "flang/Runtime/cpp-type.h" #include "flang/Runtime/descriptor.h" @@ -39,7 +40,7 @@ char *to, std::size_t toLength, const char *from); // Utility for dealing with elemental LOGICAL arguments -inline bool IsLogicalElementTrue( +inline HOST_DEVICE_ATTRS bool IsLogicalElementTrue( const Descriptor &logical, const SubscriptValue at[]) { // A LOGICAL value is false if and only if all of its bytes are zero. const char *p{logical.Element(at)}; @@ -52,8 +53,8 @@ } // Check array conformability; a scalar 'x' conforms. Crashes on error. -void CheckConformability(const Descriptor &to, const Descriptor &x, - Terminator &, const char *funcName, const char *toName, +HOST_DEVICE_ATTRS void CheckConformability(const Descriptor &to, + const Descriptor &x, Terminator &, const char *funcName, const char *toName, const char *fromName); // Helper to store integer value in result[at]. @@ -66,7 +67,8 @@ }; // Validate a KIND= argument -void CheckIntegerKind(Terminator &, int kind, const char *intrinsic); +HOST_DEVICE_ATTRS void CheckIntegerKind( + Terminator &, int kind, const char *intrinsic); template inline void PutContiguousConverted(TO *to, FROM *from, std::size_t count) { @@ -75,7 +77,7 @@ } } -static inline std::int64_t GetInt64( +static inline HOST_DEVICE_ATTRS std::int64_t GetInt64( const char *p, std::size_t bytes, Terminator &terminator) { switch (bytes) { case 1: @@ -116,7 +118,7 @@ // arguments. template