Index: openmp/libomptarget/include/interop.h =================================================================== --- openmp/libomptarget/include/interop.h +++ openmp/libomptarget/include/interop.h @@ -116,30 +116,6 @@ extern const char *__KAI_KMPC_CONVENTION omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t); -typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ - /* Compiler flags */ /* Total compiler flags must be 16 bits */ - unsigned tiedness : 1; /* task is either tied (1) or untied (0) */ - unsigned final : 1; /* task is final(1) so execute immediately */ - unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0 - unsigned destructors_thunk : 1; // set if the compiler creates a thunk to - unsigned proxy : 1; // task is a proxy task (it will be executed outside the - unsigned priority_specified : 1; // set if the compiler provides priority - unsigned detachable : 1; // 1 == can detach */ - unsigned unshackled : 1; /* 1 == unshackled task */ - unsigned target : 1; /* 1 == target task */ - unsigned reserved : 7; /* reserved for compiler use */ - unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */ - unsigned task_serial : 1; // task is executed immediately (1) or deferred (0) - unsigned tasking_ser : 1; // all tasks in team are either executed immediately - unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel - unsigned started : 1; /* 1==started, 0==not started */ - unsigned executing : 1; /* 1==executing, 0==not executing */ - unsigned complete : 1; /* 1==complete, 0==not complete */ - unsigned freed : 1; /* 1==freed, 0==allocated */ - unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ - unsigned reserved31 : 7; /* reserved for library use */ -} kmp_tasking_flags_t; - typedef enum omp_interop_backend_type_t { // reserve 0 omp_interop_backend_type_cuda_1 = 1, Index: openmp/libomptarget/src/api.cpp =================================================================== --- openmp/libomptarget/src/api.cpp +++ openmp/libomptarget/src/api.cpp @@ -15,6 +15,8 @@ #include "private.h" #include "rtl.h" +#include "llvm/ADT/SmallVector.h" + #include #include #include @@ -200,6 +202,105 @@ return Rc; } +// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect +static int __kmpc_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) { + if (Task == nullptr) + return OFFLOAD_FAIL; + + TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds; + + if (Args == nullptr) + return OFFLOAD_FAIL; + + // Call blocked version + int Rc = OFFLOAD_SUCCESS; + if (Args->IsRectMemcpy) { + Rc = omp_target_memcpy_rect( + Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume, + Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions, + Args->SrcDimensions, Args->DstDevice, Args->SrcDevice); + + DP("omp_target_memcpy_rect returns %d\n", Rc); + } else { + Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset, + Args->SrcOffset, Args->DstDevice, Args->SrcDevice); + + DP("omp_target_memcpy returns %d\n", Rc); + } + + // Release the arguments object + delete Args; + + return Rc; +} + +// Allocate and launch helper task +static int __kmpc_helper_task_creation(TargetMemcpyArgsTy *Args, + int DepObjCount, + omp_depend_t *DepObjList) { + // Create global thread ID + int Gtid = __kmpc_global_thread_num(nullptr); + int (*Fn)(kmp_int32, kmp_task_t *) = &__kmpc_target_memcpy_async_helper; + + // Setup the hidden helper flags; + kmp_int32 Flags = 0; + kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags; + InputFlags->hidden_helper = 1; + + // Alloc helper task + kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags, + sizeof(kmp_task_t), 0, Fn, -1); + + if (Ptr == nullptr) { + // Task allocation failed, delete the argument object + delete Args; + + return OFFLOAD_FAIL; + } + + // Setup the arguments passed to helper task + Ptr->shareds = Args; + + // Convert the type of depend objects + llvm::SmallVector DepObjs; + for (int i = 0; i < DepObjCount; i++) { + omp_depend_t DepObj = DepObjList[i]; + DepObjs.push_back(*((kmp_depend_info_t *)DepObj)); + } + + // Launch the helper task + int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount, + DepObjs.data(), 0, nullptr); + + return Rc; +} + +EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, + int DstDevice, int SrcDevice, + int DepObjCount, omp_depend_t *DepObjList) { + TIMESCOPE(); + DP("Call to omp_target_memcpy_async, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " + "src offset %zu, length %zu\n", + DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset, + Length); + + // Check the source and dest address + if (Dst == nullptr || Src == nullptr) + return OFFLOAD_FAIL; + + // Create task object + TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy( + Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice); + + // Create and launch helper task + int Rc = __kmpc_helper_task_creation(Args, DepObjCount, DepObjList); + + DP("omp_target_memcpy_async returns %d\n", Rc); + return Rc; +} + EXTERN int omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize, int NumDims, const size_t *Volume, @@ -260,6 +361,36 @@ return Rc; } +EXTERN int omp_target_memcpy_rect_async( + void *Dst, const void *Src, size_t ElementSize, int NumDims, + const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets, + const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice, + int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) { + TIMESCOPE(); + DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " + "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " + "volume " DPxMOD ", element size %zu, num_dims %d\n", + DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets), + DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions), + DPxPTR(Volume), ElementSize, NumDims); + + // Check the source and dest address + if (Dst == nullptr || Src == nullptr) + return OFFLOAD_FAIL; + + // Create task object + TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy( + Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets, + DstDimensions, SrcDimensions, DstDevice, SrcDevice); + + // Create and launch helper task + int Rc = __kmpc_helper_task_creation(Args, DepObjCount, DepObjList); + + DP("omp_target_memcpy_rect_async returns %d\n", Rc); + return Rc; +} + EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr, size_t Size, size_t DeviceOffset, int DeviceNum) { Index: openmp/libomptarget/src/exports =================================================================== --- openmp/libomptarget/src/exports +++ openmp/libomptarget/src/exports @@ -38,6 +38,8 @@ omp_target_is_present; omp_target_memcpy; omp_target_memcpy_rect; + omp_target_memcpy_async; + omp_target_memcpy_rect_async; omp_target_associate_ptr; omp_target_disassociate_ptr; llvm_omp_target_alloc_host; Index: openmp/libomptarget/src/private.h =================================================================== --- openmp/libomptarget/src/private.h +++ openmp/libomptarget/src/private.h @@ -98,7 +98,51 @@ * We maintain the same data structure for compatibility. */ typedef int kmp_int32; +typedef int64_t kmp_int64; typedef intptr_t kmp_intptr_t; + +typedef void *omp_depend_t; +struct kmp_task; +typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *); +typedef struct kmp_task { + void *shareds; + kmp_routine_entry_t routine; + kmp_int32 part_id; +} kmp_task_t; + +typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ + /* Compiler flags */ /* Total compiler flags must be 16 bits */ + unsigned tiedness : 1; /* task is either tied (1) or untied (0) */ + unsigned final : 1; /* task is final(1) so execute immediately */ + unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0 + code path */ + unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to + invoke destructors from the runtime */ + unsigned proxy : 1; /* task is a proxy task (it will be executed outside the + context of the RTL) */ + unsigned priority_specified : 1; /* set if the compiler provides priority + setting for the task */ + unsigned detachable : 1; /* 1 == can detach */ + unsigned hidden_helper : 1; /* 1 == hidden helper task */ + unsigned reserved : 8; /* reserved for compiler use */ + + /* Library flags */ /* Total library flags must be 16 bits */ + unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */ + unsigned task_serial : 1; // task is executed immediately (1) or deferred (0) + unsigned tasking_ser : 1; // all tasks in team are either executed immediately + // (1) or may be deferred (0) + unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel + // (0) [>= 2 threads] + /* If either team_serial or tasking_ser is set, task team may be NULL */ + /* Task State Flags: */ + unsigned started : 1; /* 1==started, 0==not started */ + unsigned executing : 1; /* 1==executing, 0==not executing */ + unsigned complete : 1; /* 1==complete, 0==not complete */ + unsigned freed : 1; /* 1==freed, 0==allocated */ + unsigned native : 1; /* 1==gcc-compiled task, 0==intel */ + unsigned reserved31 : 7; /* reserved for library use */ +} kmp_tasking_flags_t; + // Compiler sends us this info: typedef struct kmp_depend_info { kmp_intptr_t base_addr; @@ -117,6 +161,88 @@ kmp_depend_info_t *dep_list, kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list) __attribute__((weak)); + +kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid, + kmp_int32 flags, size_t sizeof_kmp_task_t, + size_t sizeof_shareds, + kmp_routine_entry_t task_entry) + __attribute__((weak)); + +kmp_task_t * +__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + kmp_routine_entry_t task_entry, + kmp_int64 device_id) __attribute__((weak)); + +kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, + kmp_task_t *new_task, kmp_int32 ndeps, + kmp_depend_info_t *dep_list, + kmp_int32 ndeps_noalias, + kmp_depend_info_t *noalias_dep_list) + __attribute__((weak)); + +/** + * The argument set that is passed from asynchronous memory copy to block + * version of memory copy invoked in helper task + */ +struct TargetMemcpyArgsTy { + /** + * Common attribuutes + */ + void *Dst; + const void *Src; + int DstDevice; + int SrcDevice; + + /** + * The flag that denotes single dimensional or rectangle dimensional copy + */ + bool IsRectMemcpy; + + /** + * Arguments for single dimensional copy + */ + size_t Length; + size_t DstOffset; + size_t SrcOffset; + + /** + * Arguments for rectangle dimensional copy + */ + size_t ElementSize; + int NumDims; + const size_t *Volume; + const size_t *DstOffsets; + const size_t *SrcOffsets; + const size_t *DstDimensions; + const size_t *SrcDimensions; + + /** + * Constructor for single dimensional copy + */ + TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length, + size_t DstOffset, size_t SrcOffset, int DstDevice, + int SrcDevice) + : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice), + IsRectMemcpy(false), Length(Length), DstOffset(DstOffset), + SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0), + DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){}; + + /** + * Constructor for rectangle dimensional copy + */ + TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize, + int NumDims, const size_t *Volume, + const size_t *DstOffsets, const size_t *SrcOffsets, + const size_t *DstDimensions, const size_t *SrcDimensions, + int DstDevice, int SrcDevice) + : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice), + IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0), + ElementSize(ElementSize), NumDims(NumDims), Volume(Volume), + DstOffsets(DstOffsets), SrcOffsets(SrcOffsets), + DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){}; +}; + #ifdef __cplusplus } #endif Index: openmp/libomptarget/test/api/omp_target_memcpy_async1.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_async1.c @@ -0,0 +1,47 @@ +// Test case for omp_target_memcpy_async, oringally from GCC + +#include +#include "stdio.h" +#include +// #include + +int main () { + // __tgt_init_all_rtls(); + + int d = omp_get_default_device (); + int id = omp_get_initial_device (); + int q[128], i; + void *p; + + if (d < 0 || d >= omp_get_num_devices ()) + d = id; + + p = omp_target_alloc (130 * sizeof (int), d); + if (p == NULL) + return 0; + + for (i = 0; i < 128; i++) + q[i] = i; + + if (omp_target_memcpy_async(p, q, 128 * sizeof (int), sizeof (int), 0, d, id, 0, NULL)) { + abort(); + } + +#pragma omp taskwait + + int q2[128]; + for (i = 0; i < 128; ++i) + q2[i] = 0; + if (omp_target_memcpy_async (q2, p, 128 * sizeof(int), 0, sizeof (int), id, d, 0, NULL)) + abort (); + +#pragma omp taskwait + + for (i = 0; i < 128; ++i) + if (q2[i] != q[i]) + abort (); + + omp_target_free (p, d); + + return 0; +} Index: openmp/libomptarget/test/api/omp_target_memcpy_async2.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_async2.c @@ -0,0 +1,71 @@ +#include +#include "stdio.h" +#include + +int main() { + int d = omp_get_default_device (); + int id = omp_get_initial_device (); + int a[128], b[64], c[32], e[16], q[128], i; + void *p; + + if (d < 0 || d >= omp_get_num_devices ()) + d = id; + + p = omp_target_alloc (130 * sizeof (int), d); + if (p == NULL) + return 0; + + for (i = 0; i < 128; ++i) + a[i] = i + 1; + for (i = 0; i < 64; ++i) + b[i] = i + 2; + for (i = 0; i < 32; i++) + c[i] = 0; + for (i = 0; i < 16; i++) + e[i] = i + 4; + + omp_depend_t obj[2]; + +#pragma omp parallel num_threads(5) +#pragma omp single + { +#pragma omp task depend(out: p) + omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id); + +#pragma omp task depend(inout: p) + omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id); + +#pragma omp task depend(out: c) + for (i = 0; i < 32; i++) + c[i] = i + 3; + +#pragma omp depobj(obj[0]) depend(inout: p) +#pragma omp depobj(obj[1]) depend(in: c) + omp_target_memcpy_async (p, c, 32 * sizeof (int), 0, 0, d, id, 2, obj); + +#pragma omp task depend(in: p) + omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id); + } + +#pragma omp taskwait + + for (i = 0; i < 128; ++i) + q[i] = 0; + omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d); + for (i = 0; i < 16; ++i) + if (q[i] != i + 4) + abort(); + for (i = 16; i < 32; ++i) + if (q[i] != i + 3) + abort(); + for (i = 32; i < 64; ++i) + if (q[i] != i + 2) + abort(); + for (i = 64; i < 128; ++i) + if (q[i] != i + 1) + abort(); + + omp_target_free (p, d); + + return 0; +} Index: openmp/libomptarget/test/api/omp_target_memcpy_async3.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_async3.c @@ -0,0 +1,221 @@ +#include +#include +#include + +#define VERBOSE_TIME + +int num_timers; +double *times; +void timer_set(const char *name, int n) { + printf("%s:\n", name); + num_timers = n; + times = (double *) calloc(n, sizeof(double)); +} + +void timer_start(int i) { + if (i < 0 || i > num_timers) + abort(); + times[i] = omp_get_wtime(); +} + +void timer_end(int i) { + if (i < 0 || i > num_timers) + abort(); + double t = omp_get_wtime() - times[i]; + times[i] = t; + printf(" - Round %d: %f[sec]\n", i, t); +} + +void timer_summarize() { + double min = -1; + double max = 0; + double total = 0; + + for (int i = 0; i < num_timers; i++) { + double t = times[i]; + if (min < 0 || t < min) + min = t; + if (t > max) + max = t; + total += t; + } + double avg = total / num_timers; + printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg); +} + +void init(int *p, size_t size) { + for (int i = 0; i < size; i++) + p[i] = i; +} + +void local(int *q, size_t size) { + for (int i = 0; i < size; i++) { + int i2 = ((int) (size * 1.5)) % size; + int tmp = q[i]; + q[i] = q[i2]; + q[i2] = tmp; + } +} + +void kernel_sync(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q) { + init(p1, size1); + init(q, size2); + timer_set("kernel_sync", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3; + t1 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) + omp_target_memcpy(p_dev, p1, size1 * sizeof(int), 0, 0, d, id); + omp_target_memcpy(p2, p_dev, size1 * sizeof(int), 0, 0, id, d); +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Invoking memcpy: %f[sec]\n", t2-t1); + printf(" -- Local work: %f[sec]\n", t3-t2); +#endif + } + timer_summarize(); +} + +void kernel_async(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q) { + init(p1, size1); + init(q, size2); + timer_set("kernel_async", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3, t4, t5; + t1 = omp_get_wtime(); +#endif + + { +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) + omp_depend_t obj1[1], obj2[1]; +#pragma omp depobj(obj1[0]) depend(out: p_dev) + omp_target_memcpy_async(p_dev, p1, size1 * sizeof(int), 0, 0, d, id, 1, obj1); +#pragma omp depobj(obj2[0]) depend(in: p_dev) + omp_target_memcpy_async(p2, p_dev, size1 * sizeof(int), 0, 0, id, d, 1, obj2); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t4 = omp_get_wtime(); +#endif + +#pragma omp taskwait + } +#ifdef VERBOSE_TIME + t5 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Starting parallel region: %f[sec]\n", t2-t1); + printf(" -- Invoking memcpy async: %f[sec]\n", t3-t2); + printf(" -- Local work: %f[sec]\n", t4-t3); + printf(" -- Task wait & ending parallel region: %f[sec]\n", t5-t4); +#endif + } + timer_summarize(); +} + +void kernel_task(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q) { + init(p1, size1); + init(q, size2); + timer_set("kernel_task", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3, t4, t5; + t1 = omp_get_wtime(); +#endif + +#pragma omp parallel +#pragma omp single + { +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) +#pragma omp task depend(out: p_dev) + omp_target_memcpy(p_dev, p1, size1 * sizeof(int), 0, 0, d, id); +#pragma omp task depend(in: p_dev) + omp_target_memcpy(p2, p_dev, size1 * sizeof(int), 0, 0, id, d); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t4 = omp_get_wtime(); +#endif + +#pragma omp taskwait + } +#ifdef VERBOSE_TIME + t5 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Starting parallel region: %f[sec]\n", t2-t1); + printf(" -- Invoking memcpy as task: %f[sec]\n", t3-t2); + printf(" -- Local work: %f[sec]\n", t4-t3); + printf(" -- Task wait & ending parallel region: %f[sec]\n", t5-t4); +#endif + } + timer_summarize(); +} + +int main(int argc, char* argv[]) { + size_t size1 = (argc > 1) ? atoi(argv[1]) : 600 * 1000 * 1000; + size_t size2 = (argc > 2) ? atoi(argv[2]) : size1; + int round = (argc > 3) ? atoi(argv[3]) : 5; + printf("memory copy size = %lu, local work size = %lu, total rounds = %d\n", size1, size2, round); + + int d = omp_get_default_device(); + int id = omp_get_initial_device(); + if (d < 0 || d >= omp_get_num_devices()) + d = id; + + // Arrays for target memcpy + int *p1 = (int *) malloc(size1 * sizeof(int)); + int *p2 = (int *) malloc(size1 * sizeof(int)); + void *p_dev = omp_target_alloc(size1 * sizeof(int), d); + + // Array for local work + int *q = (int *) malloc(size2 * sizeof(int)); + + kernel_sync(d, id, size1, size2, round, p1, p2, p_dev, q); + kernel_async(d, id, size1, size2, round, p1, p2, p_dev, q); + kernel_task(d, id, size1, size2, round, p1, p2, p_dev, q); + return 0; +} Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c @@ -0,0 +1,64 @@ +#include +#include +#include + +#define NUM_DIMS 3 + +int main() { + int d = omp_get_default_device (); + int id = omp_get_initial_device (); + int q[128], q2[128], i; + void *p; + + if (d < 0 || d >= omp_get_num_devices ()) + d = id; + + p = omp_target_alloc (130 * sizeof (int), d); + if (p == NULL) + return 0; + + if (omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, + NULL, d, id, 0, NULL) < 3 + || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, + NULL, id, d, 0, NULL) < 3 + || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL, + NULL, id, id, 0, NULL) < 3) + abort (); + + for (i = 0; i < 128; i++) + q[i] = 0; + if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0) + abort (); + + for (i = 0; i < 128; i++) + q[i] = i + 1; + + size_t volume[NUM_DIMS] = { 1, 2, 3 }; + size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t src_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 }; + size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 }; + + if (omp_target_memcpy_rect_async (p, q, sizeof (int), NUM_DIMS, volume, + dst_offsets, src_offsets, dst_dimensions, + src_dimensions, d, id, 0, NULL) != 0) + abort (); + +#pragma omp taskwait + + for (i = 0; i < 128; i++) + q2[i] = 0; + if (omp_target_memcpy (q2, p, 128 * sizeof (int), 0, 0, id, d) != 0) + abort (); + + /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0 */ + if (q2[0] != 1 || q2[1] != 2 || q2[2] !=3 || q2[3] != 0 || q2[4] != 0 + || q2[5] != 5 || q2[6] != 6 || q2[7] != 7) + abort (); + for (i = 8; i < 128; ++i) + if (q2[i] != 0) + abort (); + + omp_target_free (p, d); + return 0; +} Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c @@ -0,0 +1,87 @@ +#include +#include + +#define NUM_DIMS 3 + +int main () { + int d = omp_get_default_device (); + int id = omp_get_initial_device (); + int a[128], b[64], c[128], e[16], q[128], i; + void *p; + + if (d < 0 || d >= omp_get_num_devices ()) + d = id; + + p = omp_target_alloc (130 * sizeof (int), d); + if (p == NULL) + return 0; + + for (i = 0; i < 128; i++) + q[i] = 0; + if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0) + abort (); + + size_t volume[NUM_DIMS] = { 2, 2, 3 }; + size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t src_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 }; + size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 }; + + for (i = 0; i < 128; i++) + a[i] = 42; + for (i = 0; i < 64; i++) + b[i] = 24; + for (i = 0; i < 128; i++) + c[i] = 0; + for (i = 0; i < 16; i++) + e[i] = 77; + + omp_depend_t obj[2]; + +#pragma omp parallel num_threads(5) +#pragma omp single + { +#pragma omp task depend (out: p) + omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id); + +#pragma omp task depend(inout: p) + omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id); + +#pragma omp task depend(out: c) + for (i = 0; i < 128; i++) + c[i] = i + 1; + +#pragma omp depobj(obj[0]) depend(inout: p) +#pragma omp depobj(obj[1]) depend(in: c) + + /* This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and + 13 14 15 - - 17 18 19 - - at positions 20..29. */ + omp_target_memcpy_rect_async (p, c, sizeof (int), NUM_DIMS, volume, + dst_offsets, src_offsets, dst_dimensions, + src_dimensions, d, id, 2, obj); + +#pragma omp task depend(in: p) + omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id); + } + +#pragma omp taskwait + + if (omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d) != 0) + abort (); + + for (i = 0; i < 16; ++i) + if (q[i] != 77) + abort (); + if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18 + || q[27] != 19) + abort (); + for (i = 28; i < 64; ++i) + if (q[i] != 24) + abort (); + for (i = 64; i < 128; ++i) + if (q[i] != 42) + abort (); + + omp_target_free (p, d); + return 0; +} Index: openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c =================================================================== --- /dev/null +++ openmp/libomptarget/test/api/omp_target_memcpy_rect_async3.c @@ -0,0 +1,250 @@ +#include +#include +#include + +#define NUM_DIMS 3 +#define RECT_WIDTH 4 + +#define VERBOSE_TIME + +int num_timers; +double *times; +void timer_set(const char *name, int n) { + printf("%s:\n", name); + num_timers = n; + times = (double *) calloc(n, sizeof(double)); +} + +void timer_start(int i) { + if (i < 0 || i > num_timers) + abort(); + times[i] = omp_get_wtime(); +} + +void timer_end(int i) { + if (i < 0 || i > num_timers) + abort(); + double t = omp_get_wtime() - times[i]; + times[i] = t; + printf(" - Round %d: %f[sec]\n", i, t); +} + +void timer_summarize() { + double min = -1; + double max = 0; + double total = 0; + + for (int i = 0; i < num_timers; i++) { + double t = times[i]; + if (min < 0 || t < min) + min = t; + if (t > max) + max = t; + total += t; + } + double avg = total / num_timers; + printf(" - Summary: min = %f[sec], max = %f[sec], avg = %f[sec]\n", min, max, avg); +} + +void init(int *p, size_t size) { + for (int i = 0; i < size; i++) + p[i] = i; +} + +void local(int *q, size_t size) { + for (int i = 0; i < size; i++) { + int i2 = ((int) (size * 1.5)) % size; + int tmp = q[i]; + q[i] = q[i2]; + q[i2] = tmp; + } +} + +void kernel_sync(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q, + size_t* volumes, size_t* dst_offsets, size_t* src_offsets, + size_t* dst_dimensions, size_t* src_dimensions) { + init(p1, size1 * size1 * size1); + init(q, size2); + timer_set("kernel_sync", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3; + t1 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) + omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes, dst_offsets, src_offsets, + dst_dimensions, src_dimensions, d, id); + omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes, src_offsets, dst_offsets, + src_dimensions, dst_dimensions, id, d); + +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Invoking memcpy: %f[sec]\n", t2-t1); + printf(" -- Local work: %f[sec]\n", t3-t2); +#endif + } + timer_summarize(); +} + +void kernel_async(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q, + size_t* volumes, size_t* dst_offsets, size_t* src_offsets, + size_t* dst_dimensions, size_t* src_dimensions) { + init(p1, size1 * size1 * size1); + init(q, size2); + timer_set("kernel_async", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3, t4, t5; + t1 = omp_get_wtime(); +#endif + + { +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) + omp_depend_t obj1[1], obj2[1]; +#pragma omp depobj(obj1[0]) depend(out: p_dev) + omp_target_memcpy_rect_async(p_dev, p1, sizeof(int), NUM_DIMS, volumes, + dst_offsets, src_offsets, dst_dimensions, src_dimensions, + d, id, 1, obj1); +#pragma omp depobj(obj2[0]) depend(in: p_dev) + omp_target_memcpy_rect_async(p2, p_dev, sizeof(int), NUM_DIMS, volumes, + src_offsets, dst_offsets, src_dimensions, dst_dimensions, + id, d, 1, obj2); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t4 = omp_get_wtime(); +#endif + +#pragma omp taskwait + } +#ifdef VERBOSE_TIME + t5 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Starting parallel region: %f[sec]\n", t2-t1); + printf(" -- Invoking memcpy async: %f[sec]\n", t3-t2); + printf(" -- Local work: %f[sec]\n", t4-t3); + printf(" -- Task wait & ending parallel region: %f[sec]\n", t5-t4); +#endif + } + timer_summarize(); +} + +void kernel_task(int d, int id, size_t size1, size_t size2, int round, + int *p1, int *p2, void *p_dev, int *q, + size_t* volumes, size_t* dst_offsets, size_t* src_offsets, + size_t* dst_dimensions, size_t* src_dimensions) { + init(p1, size1 * size1 * size1); + init(q, size2); + timer_set("kernel_task", round); + + for (int r = 0; r < round; r++) { + timer_start(r); +#ifdef VERBOSE_TIME + double t1, t2, t3, t4, t5; + t1 = omp_get_wtime(); +#endif + +#pragma omp parallel +#pragma omp single + { +#ifdef VERBOSE_TIME + t2 = omp_get_wtime(); +#endif + + // Ping-poing memcpy (assume offloading GPU task and retrieving result) +#pragma omp task depend(out: p_dev) + omp_target_memcpy_rect(p_dev, p1, size1 * sizeof(int), NUM_DIMS, volumes, + dst_offsets, src_offsets, dst_dimensions, src_dimensions, d, id); +#pragma omp task depend(in: p_dev) + omp_target_memcpy_rect(p2, p_dev, size1 * sizeof(int), NUM_DIMS, volumes, + src_offsets, dst_offsets, src_dimensions, dst_dimensions, id, d); +#ifdef VERBOSE_TIME + t3 = omp_get_wtime(); +#endif + + // Local task + local(q, size2); +#ifdef VERBOSE_TIME + t4 = omp_get_wtime(); +#endif + +#pragma omp taskwait + } +#ifdef VERBOSE_TIME + t5 = omp_get_wtime(); +#endif + + timer_end(r); +#ifdef VERBOSE_TIME + printf(" -- Starting parallel region: %f[sec]\n", t2-t1); + printf(" -- Invoking memcpy as task: %f[sec]\n", t3-t2); + printf(" -- Local work: %f[sec]\n", t4-t3); + printf(" -- Task wait & ending parallel region: %f[sec]\n", t5-t4); +#endif + } + timer_summarize(); +} + +int main(int argc, char* argv[]) { + size_t size1 = (argc > 1) ? atoi(argv[1]) * 20 : RECT_WIDTH * 20; + + size_t size2 = size1; + int round = (argc > 2) ? atoi(argv[2]) : 5; + + printf("memory copy size = %lu, local work size = %lu, total rounds = %d\n", size1, size2, round); + + size_t volume[NUM_DIMS] = { size1 * size1, size1, size1 }; + size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t src_offsets[NUM_DIMS] = { 0, 0, 0 }; + size_t dst_dimensions[NUM_DIMS] = { 0, 0, 0 }; + size_t src_dimensions[NUM_DIMS] = { 0, 0, 0 }; + + int d = omp_get_default_device(); + int id = omp_get_initial_device(); + if (d < 0 || d >= omp_get_num_devices()) + d = id; + + // Arrays for target memcpy + int *p1 = (int *) malloc(size1 * size1 * size1 * sizeof(int)); + int *p2 = (int *) malloc(size1 * size1 * size1 * sizeof(int)); + void *p_dev = omp_target_alloc(size1 * size1 * size1 * sizeof(int), d); + + // Array for local work + int *q = (int *) malloc(size2 * sizeof(int)); + + kernel_sync(d, id, size1, size2, round, p1, p2, p_dev, q, + volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions); + kernel_async(d, id, size1, size2, round, p1, p2, p_dev, q, + volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions); + kernel_task(d, id, size1, size2, round, p1, p2, p_dev, q, + volume, dst_offsets, src_offsets, dst_dimensions, src_dimensions); + return 0; +}