diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt --- a/openmp/runtime/CMakeLists.txt +++ b/openmp/runtime/CMakeLists.txt @@ -30,7 +30,7 @@ # If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake libomp_get_architecture(LIBOMP_DETECTED_ARCH) set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING - "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64).") + "The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64/ve).") # Should assertions be enabled? They are on by default. set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL "enable assertions?") @@ -63,6 +63,8 @@ set(LIBOMP_ARCH riscv64) elseif(LIBOMP_NATIVE_ARCH MATCHES "loongarch64") set(LIBOMP_ARCH loongarch64) + elseif(LIBOMP_NATIVE_ARCH MATCHES "ve") + set(LIBOMP_ARCH ve) else() # last ditch effort libomp_get_architecture(LIBOMP_ARCH) @@ -83,7 +85,7 @@ endif() endif() -libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64) +libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve) set(LIBOMP_LIB_TYPE normal CACHE STRING "Performance,Profiling,Stubs library (normal/profile/stubs)") @@ -162,6 +164,7 @@ set(MIPS FALSE) set(RISCV64 FALSE) set(LOONGARCH64 FALSE) +set(VE FALSE) if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32") # IA-32 architecture set(IA32 TRUE) elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64 architecture @@ -188,6 +191,8 @@ set(RISCV64 TRUE) elseif("${LIBOMP_ARCH}" STREQUAL "loongarch64") # LoongArch64 architecture set(LOONGARCH64 TRUE) +elseif("${LIBOMP_ARCH}" STREQUAL "ve") # VE architecture + set(VE TRUE) endif() # Set some flags based on build_type diff --git a/openmp/runtime/cmake/LibompGetArchitecture.cmake 
b/openmp/runtime/cmake/LibompGetArchitecture.cmake --- a/openmp/runtime/cmake/LibompGetArchitecture.cmake +++ b/openmp/runtime/cmake/LibompGetArchitecture.cmake @@ -49,6 +49,8 @@ #error ARCHITECTURE=riscv64 #elif defined(__loongarch__) && __loongarch_grlen == 64 #error ARCHITECTURE=loongarch64 + #elif defined(__ve__) + #error ARCHITECTURE=ve #else #error ARCHITECTURE=UnknownArchitecture #endif diff --git a/openmp/runtime/cmake/LibompUtils.cmake b/openmp/runtime/cmake/LibompUtils.cmake --- a/openmp/runtime/cmake/LibompUtils.cmake +++ b/openmp/runtime/cmake/LibompUtils.cmake @@ -111,6 +111,8 @@ set(${return_arch_string} "RISCV64" PARENT_SCOPE) elseif(${LOONGARCH64}) set(${return_arch_string} "LOONGARCH64" PARENT_SCOPE) + elseif(${VE}) + set(${return_arch_string} "VE" PARENT_SCOPE) else() set(${return_arch_string} "${LIBOMP_ARCH}" PARENT_SCOPE) libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}") diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1170,6 +1170,10 @@ #elif KMP_ARCH_X86_64 #define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) #define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024)) +#elif KMP_ARCH_VE +// Minimum stack size for pthread for VE is 4MB. +// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm +#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024)) #else #define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024)) #endif diff --git a/openmp/runtime/src/kmp_affinity.h b/openmp/runtime/src/kmp_affinity.h --- a/openmp/runtime/src/kmp_affinity.h +++ b/openmp/runtime/src/kmp_affinity.h @@ -286,6 +286,17 @@ #elif __NR_sched_getaffinity != 123 #error Wrong code for getaffinity system call. #endif /* __NR_sched_getaffinity */ +#elif KMP_ARCH_VE +#ifndef __NR_sched_setaffinity +#define __NR_sched_setaffinity 203 +#elif __NR_sched_setaffinity != 203 +#error Wrong code for setaffinity system call. 
+#endif /* __NR_sched_setaffinity */ +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 204 +#elif __NR_sched_getaffinity != 204 +#error Wrong code for getaffinity system call. +#endif /* __NR_sched_getaffinity */ #else #error Unknown or unsupported architecture #endif /* KMP_ARCH_* */ diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -178,7 +178,7 @@ #if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS #define KMP_SIZE_T_SPEC KMP_UINT32_SPEC #elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE #define KMP_SIZE_T_SPEC KMP_UINT64_SPEC #else #error "Can't determine size_t printf format specifier." @@ -1043,7 +1043,7 @@ #endif /* KMP_OS_WINDOWS */ #if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE #if KMP_OS_WINDOWS #undef KMP_MB #define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst) diff --git a/openmp/runtime/src/kmp_platform.h b/openmp/runtime/src/kmp_platform.h --- a/openmp/runtime/src/kmp_platform.h +++ b/openmp/runtime/src/kmp_platform.h @@ -93,6 +93,7 @@ #define KMP_ARCH_MIPS64 0 #define KMP_ARCH_RISCV64 0 #define KMP_ARCH_LOONGARCH64 0 +#define KMP_ARCH_VE 0 #if KMP_OS_WINDOWS #if defined(_M_AMD64) || defined(__x86_64) @@ -142,6 +143,9 @@ #elif defined __loongarch__ && __loongarch_grlen == 64 #undef KMP_ARCH_LOONGARCH64 #define KMP_ARCH_LOONGARCH64 1 +#elif defined __ve__ +#undef KMP_ARCH_VE +#define KMP_ARCH_VE 1 #endif #endif @@ -206,7 +210,7 @@ // TODO: Fixme - This is clever, but really fugly #if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \ KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \ - 
KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64) + KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE) #error Unknown or unsupported architecture #endif diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -8830,7 +8830,7 @@ int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ - KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 + KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD diff --git a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h --- a/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h +++ b/openmp/runtime/src/thirdparty/ittnotify/ittnotify_config.h @@ -162,6 +162,10 @@ #define ITT_ARCH_ARM64 6 #endif /* ITT_ARCH_ARM64 */ +#ifndef ITT_ARCH_VE +#define ITT_ARCH_VE 8 +#endif /* ITT_ARCH_VE */ + #ifndef ITT_ARCH #if defined _M_IX86 || defined __i386__ #define ITT_ARCH ITT_ARCH_IA32 @@ -175,6 +179,8 @@ #define ITT_ARCH ITT_ARCH_ARM64 #elif defined __powerpc64__ #define ITT_ARCH ITT_ARCH_PPC64 +#elif defined __ve__ +#define ITT_ARCH ITT_ARCH_VE #endif #endif diff --git a/openmp/runtime/src/z_Linux_asm.S b/openmp/runtime/src/z_Linux_asm.S --- a/openmp/runtime/src/z_Linux_asm.S +++ b/openmp/runtime/src/z_Linux_asm.S @@ -2060,6 +2060,198 @@ #endif /* KMP_ARCH_LOONGARCH64 */ +#if KMP_ARCH_VE + +//------------------------------------------------------------------------ +// +// typedef void (*microtask_t)(int *gtid, int *tid, ...); +// +// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc, +// void *p_argv[] +// #if OMPT_SUPPORT +// , +// void **exit_frame_ptr +// #endif +// ) { +// #if OMPT_SUPPORT +// 
*exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0); +// #endif +// +// (*pkfn)(&gtid, &tid, argv[0], ...); +// +// return 1; +// } +// +// Parameters: +// s0: pkfn +// s1: gtid +// s2: tid +// s3: argc +// s4: p_argv +// s5: exit_frame_ptr +// +// Locals: +// __gtid: gtid param pushed on stack so can pass &gtid to pkfn +// __tid: tid param pushed on stack so can pass &tid to pkfn +// +// Temp. registers: +// +// s34: used to calculate the dynamic stack size +// s35: used as temporary for stack placement calculation +// s36: used as temporary for stack arguments +// s37: used as temporary for number of remaining pkfn parms +// s38: used to traverse p_argv array +// +// return: s0 (always 1/TRUE) +// + +__gtid = -4 +__tid = -8 + +// -- Begin __kmp_invoke_microtask +// mark_begin; + .text + .globl __kmp_invoke_microtask + // A function requires 8-byte alignment. + .p2align 3 + .type __kmp_invoke_microtask,@function +__kmp_invoke_microtask: + .cfi_startproc + + // First, save fp and lr. VE stores them at caller stack frame. + st %fp, 0(, %sp) + st %lr, 8(, %sp) + or %fp, 0, %sp + .cfi_def_cfa %fp, 0 + .cfi_offset %lr, 8 + .cfi_offset %fp, 0 + + // Compute the dynamic stack size: + // + // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them + // by reference + // - We need 8 bytes for each argument. We have two + 'argc' + // arguments (consider &gtid and &tid). We need to reserve + // (argc + 2) * 8 bytes. + // - We need 176 bytes for RSA and others + // + // The total number of bytes is then (argc + 2) * 8 + 8 + 176. 
+ // + // |------------------------------| + // | return address of callee | 8(%fp) + // |------------------------------| + // | frame pointer of callee | 0(%fp) + // |------------------------------| <------------------ %fp + // | __tid / __gtid | -8(%fp) / -4(%fp) + // |------------------------------| + // | argc+2 for arguments | 176(%sp) + // |------------------------------| + // | RSA | + // |------------------------------| + // | return address | + // |------------------------------| + // | frame pointer | + // |------------------------------| <------------------ %sp + + adds.w.sx %s34, 2, %s3 + sll %s34, %s34, 3 + lea %s34, 184(, %s34) + subs.l %sp, %sp, %s34 + + // Align the stack to 16 bytes. + and %sp, -16, %sp + + // Save pkfn. + or %s12, 0, %s0 + + // Call host to allocate stack if it is necessary. + brge.l %sp, %sl, .L_kmp_pass + ld %s61, 24(, %tp) + lea %s63, 0x13b + shm.l %s63, 0(%s61) + shm.l %sl, 8(%s61) + shm.l %sp, 16(%s61) + monc + +.L_kmp_pass: + lea %s35, 176(, %sp) + adds.w.sx %s37, 0, %s3 + or %s38, 0, %s4 + +#if OMPT_SUPPORT + // Save frame pointer into exit_frame. + st %fp, 0(%s5) +#endif + + // Prepare arguments for the pkfn function (first 8 using s0-s7 + // registers, but also stored on the stack because of varargs). + + stl %s1, __gtid(%fp) + stl %s2, __tid(%fp) + + adds.l %s0, __gtid, %fp + st %s0, 0(, %s35) + adds.l %s1, __tid, %fp + st %s1, 8(, %s35) + + breq.l 0, %s37, .L_kmp_call + ld %s2, 0(, %s38) + st %s2, 16(, %s35) + + breq.l 1, %s37, .L_kmp_call + ld %s3, 8(, %s38) + st %s3, 24(, %s35) + + breq.l 2, %s37, .L_kmp_call + ld %s4, 16(, %s38) + st %s4, 32(, %s35) + + breq.l 3, %s37, .L_kmp_call + ld %s5, 24(, %s38) + st %s5, 40(, %s35) + + breq.l 4, %s37, .L_kmp_call + ld %s6, 32(, %s38) + st %s6, 48(, %s35) + + breq.l 5, %s37, .L_kmp_call + ld %s7, 40(, %s38) + st %s7, 56(, %s35) + + breq.l 6, %s37, .L_kmp_call + + // Prepare any additional argument passed through the stack. 
+ adds.l %s37, -6, %s37 + lea %s38, 48(, %s38) + lea %s35, 64(, %s35) +.L_kmp_loop: + ld %s36, 0(, %s38) + st %s36, 0(, %s35) + adds.l %s37, -1, %s37 + adds.l %s38, 8, %s38 + adds.l %s35, 8, %s35 + brne.l 0, %s37, .L_kmp_loop + +.L_kmp_call: + // Call pkfn function. + bsic %lr, (, %s12) + + // Return value. + lea %s0, 1 + + // Restore stack and return. + or %sp, 0, %fp + ld %lr, 8(, %sp) + ld %fp, 0(, %sp) + b.l.t (, %lr) +.Lfunc_end0: + .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask + .cfi_endproc + +// -- End __kmp_invoke_microtask + +#endif /* KMP_ARCH_VE */ + #if KMP_ARCH_ARM || KMP_ARCH_MIPS .data COMMON .gomp_critical_user_, 32, 3 @@ -2073,7 +2265,8 @@ #endif #endif /* KMP_ARCH_ARM */ -#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 +#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \ + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE #ifndef KMP_PREFIX_UNDERSCORE # define KMP_PREFIX_UNDERSCORE(x) x #endif @@ -2088,7 +2281,7 @@ .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8 #endif #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || - KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 */ + KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE */ #if KMP_OS_LINUX # if KMP_ARCH_ARM || KMP_ARCH_AARCH64 diff --git a/openmp/runtime/src/z_Linux_util.cpp b/openmp/runtime/src/z_Linux_util.cpp --- a/openmp/runtime/src/z_Linux_util.cpp +++ b/openmp/runtime/src/z_Linux_util.cpp @@ -2456,7 +2456,7 @@ #if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \ ((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \ KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \ - KMP_ARCH_ARM) + KMP_ARCH_ARM || KMP_ARCH_VE) // we really only need the case with 1 argument, because CLANG always build // a struct of pointers to shared variables referenced in the outlined function diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h --- 
a/openmp/runtime/test/ompt/callback.h +++ b/openmp/runtime/test/ompt/callback.h @@ -221,6 +221,13 @@ printf("%" PRIu64 ": current_address=%p or %p or %p\n", \ ompt_get_thread_data()->value, ((char *)addr) - 4, \ ((char *)addr) - 8, ((char *)addr) - 12) +#elif KMP_ARCH_VE +// On VE the NOP instruction is 8 bytes long. TODO: the extra instruction the +// compiler inserts for non-void runtime functions (and its size) is unknown. +#define print_possible_return_addresses(addr) \ + printf("%" PRIu64 ": current_address=%p or %p\n", \ + ompt_get_thread_data()->value, ((char *)addr) - 8, \ + ((char *)addr) - 8) #else #error Unsupported target architecture, cannot determine address offset! #endif