Index: libomptarget/plugins/cuda/src/rtl.cpp =================================================================== --- libomptarget/plugins/cuda/src/rtl.cpp +++ libomptarget/plugins/cuda/src/rtl.cpp @@ -587,6 +587,16 @@ return NULL; } + // Any invocation of this method with a valid host pointer will + // result in the host version of this variable being used. + // If a host pointer is not present then the intent of this + // call was to allocate data on the device even if unified + // memory may have been activated. + if ((DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) && + hst_ptr) { + return hst_ptr; + } + CUdeviceptr ptr; err = cuMemAlloc(&ptr, size); if (err != CUDA_SUCCESS) { @@ -609,6 +619,14 @@ return OFFLOAD_FAIL; } + // No need to actually copy any data to device. The data is available + // on the host and can be accessed by the device via unified memory. + // When host and device pointers are equal it means that this is called + // internally by the runtime and not directly by the user via the API. + if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + tgt_ptr == hst_ptr) + return OFFLOAD_SUCCESS; + err = cuMemcpyHtoD((CUdeviceptr)tgt_ptr, hst_ptr, size); if (err != CUDA_SUCCESS) { DP("Error when copying data from host to device. Pointers: host = " DPxMOD @@ -630,6 +648,12 @@ return OFFLOAD_FAIL; } + // Act as if a copy to device was successful in the case of + // unified memory where only a host version of the data exists. + if (DeviceInfo.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + hst_ptr == tgt_ptr) + return OFFLOAD_SUCCESS; + err = cuMemcpyDtoH(hst_ptr, (CUdeviceptr)tgt_ptr, size); if (err != CUDA_SUCCESS) { DP("Error when copying data from device to host. 
Pointers: host = " DPxMOD Index: libomptarget/src/api.cpp =================================================================== --- libomptarget/src/api.cpp +++ libomptarget/src/api.cpp @@ -113,7 +113,14 @@ DeviceTy& Device = Devices[device_num]; bool IsLast; // not used - int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL); + void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false); + int rc = (TgtPtr != NULL); + // Under unified memory the host pointer can be returned by the + // getTgtPtrBegin() function which means that there is no device + // pointer corresponding to ptr. This function should return false + // in that situation. + if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + rc = (TgtPtr != ptr); DP("Call to omp_target_is_present returns %d\n", rc); return rc; } Index: libomptarget/src/device.h =================================================================== --- libomptarget/src/device.h +++ libomptarget/src/device.h @@ -137,7 +137,8 @@ long getMapEntryRefCnt(void *HstPtrBegin); LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - bool &IsNew, bool IsImplicit, bool UpdateRefCount = true); + bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, + bool IsInUseDevicePtrClause = false); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, bool UpdateRefCount); Index: libomptarget/src/device.cpp =================================================================== --- libomptarget/src/device.cpp +++ libomptarget/src/device.cpp @@ -157,41 +157,69 @@ // If NULL is returned, then either data allocation failed or the user tried // to do an illegal mapping. 
void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, - int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) { + int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, + bool UpdateRefCount, bool IsInUseDevicePtrClause) { void *rc = NULL; DataMapMtx.lock(); LookupResult lr = lookupMapping(HstPtrBegin, Size); - // Check if the pointer is contained. - if (lr.Flags.IsContained || - ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { + // If unified memory is active, implicitly mapped variables that are not + // privatized use the host address. Any explicitly mapped variables also use + // the host address where correctness is not impeded. In all other cases + // maps are respected. + // TODO: In addition to the mapping rules above, when the close map + // modifier is implemented, force the mapping of the variable to the device. NOTE(review): this branch can be taken when none of the lr.Flags is set (e.g. IsImplicit), yet lr.Entry is dereferenced below - confirm lr.Entry is valid in that case. + if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !IsInUseDevicePtrClause && + (IsImplicit || !((lr.Flags.IsContained || + lr.Flags.ExtendsBefore || + lr.Flags.ExtendsAfter) && !Size))) { auto &HT = *lr.Entry; - IsNew = false; - - if (UpdateRefCount) - ++HT.RefCount; - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), - DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - (CONSIDERED_INF(HT.RefCount)) ? "INF" : - std::to_string(HT.RefCount).c_str()); - rc = (void *)tp; - } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { - // Explicit extension of mapped data - not allowed. - DP("Explicit extension of mapping is not allowed.\n"); - } else if (Size) { - // If it is not contained and Size > 0 we should create a new entry for it. 
- IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " - "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), - DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); - HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, - (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); + uintptr_t tp = HT.HstPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", DPxPTR(tp), + Size, (UpdateRefCount ? " updated" : "")); + IsHostPtr = true; rc = (void *)tp; + } else { + // Check if the pointer is contained. + if (lr.Flags.IsContained || + ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { + auto &HT = *lr.Entry; + IsNew = false; + + if (UpdateRefCount) + ++HT.RefCount; + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), + DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + (CONSIDERED_INF(HT.RefCount)) ? "INF" : + std::to_string(HT.RefCount).c_str()); + rc = (void *)tp; + } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { + // Explicit extension of mapped data - not allowed. + DP("Explicit extension of mapping is not allowed.\n"); + } else if (Size) { + // If it is not contained and Size > 0 we should create a new entry for it. + IsNew = true; + uintptr_t tp; + // If unified memory is active AND a use_device_ptr clause was used, + // we will force the allocation of a device variable. It means that the + // user really wants to have this variable replicate on the device. 
+ if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + IsInUseDevicePtrClause) + tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, NULL); + else + tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), + DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, + (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); + rc = (void *)tp; + } } DataMapMtx.unlock(); @@ -207,22 +235,39 @@ DataMapMtx.lock(); LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + // If the value isn't found in the mapping and unified shared memory + // is on then it means we have stumbled upon an implicitly mapped value + // which we need to use directly from the host. NOTE(review): lr.Entry is dereferenced below although no mapping was found; tp reduces to HstPtrBegin, so confirm whether lr.Entry is valid/needed here. + if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !(lr.Flags.IsContained || + lr.Flags.ExtendsBefore || + lr.Flags.ExtendsAfter)) { auto &HT = *lr.Entry; - IsLast = !(HT.RefCount > 1); - - if (HT.RefCount > 1 && UpdateRefCount) - --HT.RefCount; - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - (CONSIDERED_INF(HT.RefCount)) ? "INF" : - std::to_string(HT.RefCount).c_str()); + IsLast = false; + uintptr_t tp = HT.HstPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", DPxPTR(tp), + Size, (UpdateRefCount ? 
" updated" : "")); rc = (void *)tp; } else { - IsLast = false; + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + IsLast = !(HT.RefCount > 1); + + if (HT.RefCount > 1 && UpdateRefCount) + --HT.RefCount; + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + // if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + // tp = HT.HstPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + (CONSIDERED_INF(HT.RefCount)) ? "INF" : + std::to_string(HT.RefCount).c_str()); + rc = (void *)tp; + } else { + IsLast = false; + } } DataMapMtx.unlock(); @@ -244,6 +289,8 @@ } int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) { + if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + return OFFLOAD_SUCCESS; // Check if the pointer is contained in any sub-nodes. int rc; DataMapMtx.lock(); Index: libomptarget/src/omptarget.cpp =================================================================== --- libomptarget/src/omptarget.cpp +++ libomptarget/src/omptarget.cpp @@ -242,7 +242,11 @@ // Address of pointer on the host and device, respectively. 
void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; bool IsNew, Pointer_IsNew; + bool IsHostPtr = false; bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // TODO: Check if this is correct (assumes a use_device_ptr entry carries both TARGET_PARAM and RETURN_PARAM). + bool IsInUseDevicePtrClause = arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM && + arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM; // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we // have reached this point via __tgt_target_data_begin and not __tgt_target // then no argument is marked as TARGET_PARAM ("omp target data map" is not @@ -252,8 +256,10 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); // base is address of pointer. + // TODO: Check if IsInUseDevicePtrClause needs to be passed. NOTE(review): IsHostPtr is shared with the data lookup below, so either call may set it - confirm that is intended. Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef); + sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, + IsInUseDevicePtrClause); if (!Pointer_TgtPtrBegin) { DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " "illegal mapping).\n"); @@ -269,7 +275,8 @@ } void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsImplicit, UpdateRef); + data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, + IsInUseDevicePtrClause); if (!TgtPtrBegin && data_size) { // If data_size==0, then the argument could be a zero-length pointer to // NULL, so getOrAlloc() returning NULL is not an error. @@ -289,19 +296,21 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { bool copy = false; - if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { - copy = true; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { - // Copy data only if the "parent" struct has RefCount==1. 
- int32_t parent_idx = member_of(arg_types[i]); - long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { + if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { copy = true; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + // Copy data only if the "parent" struct has RefCount==1. + int32_t parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + copy = true; + } } } - if (copy) { + if (copy && !IsHostPtr) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); @@ -312,7 +321,7 @@ } } - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; @@ -387,14 +396,16 @@ if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; bool CopyMember = false; - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - // Copy data only if the "parent" struct has RefCount==1. - int32_t parent_idx = member_of(arg_types[i]); - long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { - CopyMember = true; + if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + // Copy data only if the "parent" struct has RefCount==1. 
+ int32_t parent_idx = member_of(arg_types[i]); + long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + CopyMember = true; + } } } @@ -677,7 +688,7 @@ } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { // Allocate memory for (first-)private array TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); + arg_sizes[i], NULL); if (!TgtPtrBegin) { DP ("Data allocation for %sprivate array " DPxMOD " failed, " "abort target.\n", @@ -758,6 +769,7 @@ // Deallocate (first-)private arrays for (auto it : fpArrays) { + // TODO: Verify that use_api should be true in this case. int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); if (rt != OFFLOAD_SUCCESS) { DP("Deallocation of (first-)private arrays failed.\n");