Index: libomptarget/include/omptarget.h =================================================================== --- libomptarget/include/omptarget.h +++ libomptarget/include/omptarget.h @@ -47,6 +47,8 @@ OMP_TGT_MAPTYPE_LITERAL = 0x100, // mapping is implicit OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // copy data to device + OMP_TGT_MAPTYPE_CLOSE = 0x400, // member of struct, member given by [16 MSBs] - 1 OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; Index: libomptarget/src/device.h =================================================================== --- libomptarget/src/device.h +++ libomptarget/src/device.h @@ -132,11 +132,13 @@ long getMapEntryRefCnt(void *HstPtrBegin); LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true); + bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, + bool HasCloseModifier = false); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount, bool &IsHostPtr); - int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete); + int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier = false); int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); int disassociatePtr(void *HstPtrBegin); Index: libomptarget/src/device.cpp =================================================================== --- libomptarget/src/device.cpp +++ libomptarget/src/device.cpp @@ -158,7 +158,7 @@ // to do an illegal mapping. void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, - bool UpdateRefCount) { + bool UpdateRefCount, bool HasCloseModifier) { void *rc = NULL; IsHostPtr = false; DataMapMtx.lock(); @@ -192,9 +192,9 @@ // privatized use host address. Any explicitly mapped variables also use // host address where correctness is not impeded. In all other cases // maps are respected. - // TODO: In addition to the mapping rules above, when the close map - // modifier is implemented, foce the mapping of the variable to the device. - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // In addition to the mapping rules above, the close map + // modifier forces the mapping of the variable to the device. + if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) { DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); IsHostPtr = true; @@ -204,8 +204,8 @@ IsNew = true; uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " - "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), - DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), + DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); rc = (void *)tp; @@ -269,8 +269,9 @@ return NULL; } -int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) { - if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) +int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier) { + if (RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) return OFFLOAD_SUCCESS; // Check if the pointer is contained in any sub-nodes. int rc; Index: libomptarget/src/omptarget.cpp =================================================================== --- libomptarget/src/omptarget.cpp +++ libomptarget/src/omptarget.cpp @@ -244,6 +244,9 @@ bool IsNew, Pointer_IsNew; bool IsHostPtr = false; bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // Force the creation of a device side copy of the data when: + // a close map modifier was associated with a map that contained a to. + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we // have reached this point via __tgt_target_data_begin and not __tgt_target // then no argument is marked as TARGET_PARAM ("omp target data map" is not @@ -254,7 +257,8 @@ DP("Has a pointer entry: \n"); // base is address of pointer. Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef); + sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, + HasCloseModifier); if (!Pointer_TgtPtrBegin) { DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " "illegal mapping).\n"); @@ -270,7 +274,7 @@ } void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef); + data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); if (!TgtPtrBegin && data_size) { // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. @@ -290,7 +294,8 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { bool copy = false; - if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { copy = true; } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { @@ -370,6 +375,7 @@ bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; // If PTR_AND_OBJ, HstPtrBegin is address of pointee void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, @@ -390,7 +396,8 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; bool CopyMember = false; - if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + if (!(RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { // Copy data only if the "parent" struct has RefCount==1. @@ -455,7 +462,8 @@ // Deallocate map if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete); + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, + HasCloseModifier); if (rt != OFFLOAD_SUCCESS) { DP("Deallocating data from device failed.\n"); return OFFLOAD_FAIL; Index: libomptarget/test/unified_shared_memory/close_enter_exit.c =================================================================== --- /dev/null +++ libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -0,0 +1,95 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + + // + // map + target no close + // +#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + + // CHECK: a used from unified memory. + if (device_alloc == host_alloc) + printf("a used from unified memory.\n"); + + // + // map + target with close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + // CHECK: a copied to device. + if (device_alloc != host_alloc) + printf("a copied to device.\n"); + + // + // map + use_device_ptr no close + // + device_alloc = 0; +#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from unified memory with use_device_ptr. + if (device_alloc == host_alloc) + printf("a used from unified memory with use_device_ptr.\n"); + + // + // map + use_device_ptr close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from device memory with use_device_ptr. + if (device_alloc != host_alloc) + printf("a used from device memory with use_device_ptr.\n"); + + // + // map enter/exit + close + // + device_alloc = 0; +#pragma omp target enter data map(close, to : a[ : N]) + +#pragma omp target map(from : device_alloc) + { device_alloc = &a[0]; } + +#pragma omp target exit data map(from : a[ : N]) + + // CHECK: a has been mapped to the device. + if (device_alloc != host_alloc) + printf("a has been mapped to the device.\n"); + + free(a); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} Index: libomptarget/test/unified_shared_memory/close_manual.c =================================================================== --- /dev/null +++ libomptarget/test/unified_shared_memory/close_manual.c @@ -0,0 +1,86 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + +// Dummy target region that ensures the runtime library is loaded when +// the target data begin/end functions are manually called below. +#pragma omp target + {} + + // Manual calls + int device_id = omp_get_default_device(); + int arg_num = 1; + void **args_base = (void **)&a; + void **args = (void **)&a; + int64_t arg_sizes[arg_num]; + + arg_sizes[0] = sizeof(int) * N; + + int64_t arg_types[arg_num]; + + // Ox400 enables the CLOSE map type in the runtime: + // OMP_TGT_MAPTYPE_CLOSE = 0x400 + // OMP_TGT_MAPTYPE_TO = 0x001 + arg_types[0] = 0x400 | 0x001; + + device_alloc = host_alloc; + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + +#pragma omp target data use_device_ptr(a) + { device_alloc = a; } + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + + // CHECK: a was copied to the device + if (device_alloc != host_alloc) + printf("a was copied to the device\n"); + + free(a); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} Index: libomptarget/test/unified_shared_memory/close_modifier.c =================================================================== --- /dev/null +++ libomptarget/test/unified_shared_memory/close_modifier.c @@ -0,0 +1,135 @@ +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc, *device_alloc; + void *host_data, *device_data; + int *alloc = (int *)malloc(N * sizeof(int)); + int data[N]; + + for (int i = 0; i < N; ++i) { + alloc[i] = 10; + data[i] = 1; + } + + host_data = &data[0]; + host_alloc = &alloc[0]; + +// +// Test that updates on the device are not visible to host +// when only a TO mapping is used. +// +#pragma omp target map(tofrom \ + : device_data, device_alloc) map(close, to \ + : alloc[:N], data \ + [:N]) + { + device_data = &data[0]; + device_alloc = &alloc[0]; + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Address of alloc on device different from host address. + if (device_alloc != host_alloc) + printf("Address of alloc on device different from host address.\n"); + + // CHECK: Address of data on device different from host address. + if (device_data != host_data) + printf("Address of data on device different from host address.\n"); + + // On the host, check that the arrays have been updated. + // CHECK: Alloc host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 10) + fails++; + } + printf("Alloc host values not updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 1) + fails++; + } + printf("Data host values not updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // + // Test that updates on the device are visible on host + // when a from is used. + // + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + +#pragma omp target map(close, tofrom : alloc[:N], data[:N]) + { + // CHECK: Alloc device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 11) + fails++; + } + printf("Alloc device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + // CHECK: Data device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 2) + fails++; + } + printf("Data device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // Update values on the device + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Alloc host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 12) + fails++; + } + printf("Alloc host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 3) + fails++; + } + printf("Data host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + free(alloc); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +}