Index: openmp/trunk/libomptarget/include/omptarget.h =================================================================== --- openmp/trunk/libomptarget/include/omptarget.h +++ openmp/trunk/libomptarget/include/omptarget.h @@ -48,7 +48,7 @@ OMP_TGT_MAPTYPE_LITERAL = 0x100, // mapping is implicit OMP_TGT_MAPTYPE_IMPLICIT = 0x200, - // member of struct, member given by 16 MSBs - 1 + // member of struct, member given by [16 MSBs] - 1 OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 }; Index: openmp/trunk/libomptarget/src/interface.cpp =================================================================== --- openmp/trunk/libomptarget/src/interface.cpp +++ openmp/trunk/libomptarget/src/interface.cpp @@ -33,265 +33,36 @@ RTLs.UnregisterLib(desc); } -// Following datatypes and functions (tgt_oldmap_type, combined_entry_t, -// translate_map, cleanup_map) will be removed once the compiler starts using -// the new map types. - -// Old map types -enum tgt_oldmap_type { - OMP_TGT_OLDMAPTYPE_TO = 0x001, // copy data from host to device - OMP_TGT_OLDMAPTYPE_FROM = 0x002, // copy data from device to host - OMP_TGT_OLDMAPTYPE_ALWAYS = 0x004, // copy regardless of the ref. count - OMP_TGT_OLDMAPTYPE_DELETE = 0x008, // force unmapping of data - OMP_TGT_OLDMAPTYPE_MAP_PTR = 0x010, // map pointer as well as pointee - OMP_TGT_OLDMAPTYPE_FIRST_MAP = 0x020, // first occurrence of mapped variable - OMP_TGT_OLDMAPTYPE_RETURN_PTR = 0x040, // return TgtBase addr of mapped data - OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped - OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100 // copy by value - not mapped -}; - -// Temporary functions for map translation and cleanup -struct combined_entry_t { - int num_members; // number of members in combined entry - void *base_addr; // base address of combined entry - void *begin_addr; // begin address of combined entry - void *end_addr; // size of combined entry -}; - -static void translate_map(int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, int32_t &new_arg_num, - void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes, - int64_t *&new_arg_types, bool is_target_construct) { - if (arg_num <= 0) { - DP("Nothing to translate\n"); - new_arg_num = 0; - return; - } - - // array of combined entries - combined_entry_t *cmb_entries = - (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t)); - // number of combined entries - long num_combined = 0; - // old entry is MAP_PTR? - bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool)); - // old entry is member of member_of[old] cmb_entry - int *member_of = (int *) alloca(arg_num * sizeof(int)); - // temporary storage for modifications of the original arg_types - int64_t *mod_arg_types = (int64_t *) alloca(arg_num *sizeof(int64_t)); - - DP("Translating %d map entries\n", arg_num); - for (int i = 0; i < arg_num; ++i) { - member_of[i] = -1; - is_ptr_old[i] = false; - mod_arg_types[i] = arg_types[i]; - // Scan previous entries to see whether this entry shares the same base - for (int j = 0; j < i; ++j) { - void *new_begin_addr = NULL; - void *new_end_addr = NULL; - - if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { - if (args_base[i] == args[j]) { - if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) { - DP("Entry %d has the same base as entry %d's begin address\n", i, - j); - new_begin_addr = args_base[i]; - new_end_addr = (char *)args_base[i] + sizeof(void *); - assert(arg_sizes[j] == sizeof(void *)); - is_ptr_old[j] = true; - } else { - DP("Entry %d has the same base as entry %d's begin address, but " - "%d's base was a MAP_PTR too\n", i, j, j); - int32_t to_from_always_delete = - OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM | - OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE; - if (mod_arg_types[j] & to_from_always_delete) { - DP("Resetting to/from/always/delete flags for entry %d because " - "it is only a pointer to pointer\n", j); - mod_arg_types[j] &= ~to_from_always_delete; - } - } - } - } else { - if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) && - args_base[i] == args_base[j]) { - DP("Entry %d has the same base address as entry %d\n", i, j); - new_begin_addr = args[i]; - new_end_addr = (char *)args[i] + arg_sizes[i]; - } - } - - // If we have combined the entry with a previous one - if (new_begin_addr) { - int id; - if(member_of[j] == -1) { - // We have a new entry - id = num_combined++; - DP("Creating new combined entry %d for old entry %d\n", id, j); - // Initialize new entry - cmb_entries[id].num_members = 1; - cmb_entries[id].base_addr = args_base[j]; - if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) { - cmb_entries[id].begin_addr = args_base[j]; - cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j]; - } else { - cmb_entries[id].begin_addr = args[j]; - cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j]; - } - member_of[j] = id; - } else { - // Reuse existing combined entry - DP("Reusing existing combined entry %d\n", member_of[j]); - id = member_of[j]; - } - - // Update combined entry - DP("Adding entry %d to combined entry %d\n", i, id); - cmb_entries[id].num_members++; - // base_addr stays the same - cmb_entries[id].begin_addr = - std::min(cmb_entries[id].begin_addr, new_begin_addr); - cmb_entries[id].end_addr = - std::max(cmb_entries[id].end_addr, new_end_addr); - member_of[i] = id; - break; - } - } - } - - DP("New entries: %ld combined + %d original\n", num_combined, arg_num); - new_arg_num = arg_num + num_combined; - new_args_base = (void **) malloc(new_arg_num * sizeof(void *)); - new_args = (void **) malloc(new_arg_num * sizeof(void *)); - new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); - new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t)); - - const int64_t alignment = 8; - - int next_id = 0; // next ID - int next_cid = 0; // next combined ID - int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int)); - for (int i = 0; i < arg_num; ++i) { - // It is member_of - if (member_of[i] == next_cid) { - int cid = next_cid++; // ID of this combined entry - int nid = next_id++; // ID of the new (global) entry - combined_to_new_id[cid] = nid; - DP("Combined entry %3d will become new entry %3d\n", cid, nid); - - int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n", - padding, DPxPTR(cmb_entries[cid].begin_addr)); - cmb_entries[cid].begin_addr = - (char *)cmb_entries[cid].begin_addr - padding; - } - - new_args_base[nid] = cmb_entries[cid].base_addr; - new_args[nid] = cmb_entries[cid].begin_addr; - new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr - - (char *)cmb_entries[cid].begin_addr); - new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", " - "size %" PRId64 ", type 0x%" PRIx64 "\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid]); - } else if (member_of[i] != -1) { - DP("Combined entry %3d has been encountered before, do nothing\n", - member_of[i]); - } - - // Now that the combined entry (the one the old entry was a member of) has - // been inserted into the new arguments list, proceed with the old entry. - int nid = next_id++; - DP("Old entry %3d will become new entry %3d\n", i, nid); - - new_args_base[nid] = args_base[i]; - new_args[nid] = args[i]; - new_arg_sizes[nid] = arg_sizes[i]; - int64_t old_type = mod_arg_types[i]; - - if (is_ptr_old[i]) { - // Reset TO and FROM flags - old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM); - } - - if (member_of[i] == -1) { - if (!is_target_construct) - old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM; - new_arg_types[nid] = old_type; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 - ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid], i); - } else { - // Old entry is not FIRST_MAP - old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP; - // Add MEMBER_OF - int new_member_of = combined_to_new_id[member_of[i]]; - old_type |= ((int64_t)new_member_of + 1) << 48; - new_arg_types[nid] = old_type; - DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64 - ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid, - DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid], - new_arg_types[nid], i, new_member_of); - } - } -} - -static void cleanup_map(int32_t new_arg_num, void **new_args_base, - void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types, - int32_t arg_num, void **args_base) { - if (new_arg_num > 0) { - int offset = new_arg_num - arg_num; - for (int32_t i = 0; i < arg_num; ++i) { - // Restore old base address - args_base[i] = new_args_base[i+offset]; - } - free(new_args_base); - free(new_args); - free(new_arg_sizes); - free(new_arg_types); - } -} - /// creates host-to-target data mapping, stores it in the /// libomptarget.so internal structure (an entry in a stack of data maps) /// and passes the data to the device. EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - DP("Entering data begin region for device %ld with %d mappings\n", device_id, - arg_num); + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); // No devices available? if (device_id == OFFLOAD_DEVICE_DEFAULT) { device_id = omp_get_default_device(); - DP("Use default device id %ld\n", device_id); + DP("Use default device id %" PRId64 "\n", device_id); } if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %ld ready\n", device_id); + DP("Failed to get device %" PRId64 " ready\n", device_id); return; } DeviceTy& Device = Devices[device_id]; - // Translate maps - int32_t new_arg_num; - void **new_args_base; - void **new_args; - int64_t *new_arg_sizes; - int64_t *new_arg_types; - translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num, - new_args_base, new_args, new_arg_sizes, new_arg_types, false); - - //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types); - target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes, - new_arg_types); - - // Cleanup translation memory - cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes, - new_arg_types, arg_num, args_base); +#ifdef OMPTARGET_DEBUG + for (int i=0; i> 48) - 1; } @@ -189,10 +221,33 @@ void *HstPtrBegin = args[i]; void *HstPtrBase = args_base[i]; + int64_t data_size = arg_sizes[i]; + + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + // Address of pointer on the host and device, respectively. void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; bool IsNew, Pointer_IsNew; bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we + // have reached this point via __tgt_target_data_begin and not __tgt_target + // then no argument is marked as TARGET_PARAM ("omp target data map" is not + // associated with a target region, so there are no target parameters). This + // may be considered a hack, we could revise the scheme in the future. bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { DP("Has a pointer entry: \n"); @@ -213,28 +268,22 @@ } void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - arg_sizes[i], IsNew, IsImplicit, UpdateRef); - if (!TgtPtrBegin && arg_sizes[i]) { - // If arg_sizes[i]==0, then the argument is a pointer to NULL, so - // getOrAlloc() returning NULL is not an error. + data_size, IsNew, IsImplicit, UpdateRef); + if (!TgtPtrBegin && data_size) { + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " "illegal mapping).\n"); } DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), (IsNew ? "" : " not")); if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { - void *ret_ptr; - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) - ret_ptr = Pointer_TgtPtrBegin; - else { - bool IsLast; // not used - ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false); - } - - DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr)); - args_base[i] = ret_ptr; + uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; + void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); + DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); + args_base[i] = TgtPtrBase; } if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { @@ -243,7 +292,7 @@ copy = true; } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { // Copy data only if the "parent" struct has RefCount==1. - short parent_idx = member_of(arg_types[i]); + int32_t parent_idx = member_of(arg_types[i]); long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); assert(parent_rc > 0 && "parent struct not found"); if (parent_rc == 1) { @@ -253,8 +302,8 @@ if (copy) { DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]); + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size); if (rt != OFFLOAD_SUCCESS) { DP("Copying data to device failed.\n"); rc = OFFLOAD_FAIL; @@ -297,16 +346,33 @@ continue; void *HstPtrBegin = args[i]; + int64_t data_size = arg_sizes[i]; + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + bool IsLast; bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, UpdateRef); DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin), + " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")); bool DelEntry = IsLast || ForceDelete; @@ -324,7 +390,7 @@ if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { // Copy data only if the "parent" struct has RefCount==1. - short parent_idx = member_of(arg_types[i]); + int32_t parent_idx = member_of(arg_types[i]); long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); assert(parent_rc > 0 && "parent struct not found"); if (parent_rc == 1) { @@ -334,8 +400,8 @@ if (DelEntry || Always || CopyMember) { DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]); + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size); if (rt != OFFLOAD_SUCCESS) { DP("Copying data from device failed.\n"); rc = OFFLOAD_FAIL; @@ -348,7 +414,7 @@ // copies. If the struct is going to be deallocated, remove any remaining // shadow pointer entries for this struct. uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i]; + uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; Device.ShadowMtx.lock(); for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); it != Device.ShadowPtrMap.end(); ++it) { @@ -378,7 +444,7 @@ // Deallocate map if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete); + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete); if (rt != OFFLOAD_SUCCESS) { DP("Deallocating data from device failed.\n"); rc = OFFLOAD_FAIL;