diff --git a/openmp/libomptarget/include/interop.h b/openmp/libomptarget/include/interop.h
--- a/openmp/libomptarget/include/interop.h
+++ b/openmp/libomptarget/include/interop.h
@@ -116,30 +116,6 @@
 extern const char *__KAI_KMPC_CONVENTION
 omp_get_interop_rc_desc(const omp_interop_t, omp_interop_rc_t);
 
-typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
-  /* Compiler flags */             /* Total compiler flags must be 16 bits */
-  unsigned tiedness : 1;           /* task is either tied (1) or untied (0) */
-  unsigned final : 1;              /* task is final(1) so execute immediately */
-  unsigned merged_if0 : 1; // no __kmpc_task_{begin/complete}_if0 calls in if0
-  unsigned destructors_thunk : 1; // set if the compiler creates a thunk to
-  unsigned proxy : 1; // task is a proxy task (it will be executed outside the
-  unsigned priority_specified : 1; // set if the compiler provides priority
-  unsigned detachable : 1;         // 1 == can detach */
-  unsigned unshackled : 1;         /* 1 == unshackled task */
-  unsigned target : 1;             /* 1 == target task */
-  unsigned reserved : 7;           /* reserved for compiler use */
-  unsigned tasktype : 1;    /* task is either explicit(1) or implicit (0) */
-  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
-  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
-  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
-  unsigned started : 1;     /* 1==started, 0==not started     */
-  unsigned executing : 1;   /* 1==executing, 0==not executing */
-  unsigned complete : 1;    /* 1==complete, 0==not complete   */
-  unsigned freed : 1;       /* 1==freed, 0==allocated        */
-  unsigned native : 1;      /* 1==gcc-compiled task, 0==intel */
-  unsigned reserved31 : 7;  /* reserved for library use */
-} kmp_tasking_flags_t;
-
 typedef enum omp_interop_backend_type_t {
   // reserve 0
   omp_interop_backend_type_cuda_1 = 1,
diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp
--- a/openmp/libomptarget/src/api.cpp
+++ b/openmp/libomptarget/src/api.cpp
@@ -15,6 +15,8 @@
 #include "private.h"
 #include "rtl.h"
 
+#include "llvm/ADT/SmallVector.h"
+
 #include <climits>
 #include <cstdlib>
 #include <cstring>
@@ -207,6 +209,105 @@
   return Rc;
 }
 
+// The helper function that calls omp_target_memcpy or omp_target_memcpy_rect
+static int libomp_target_memcpy_async_helper(kmp_int32 Gtid, kmp_task_t *Task) {
+  if (Task == nullptr)
+    return OFFLOAD_FAIL;
+
+  TargetMemcpyArgsTy *Args = (TargetMemcpyArgsTy *)Task->shareds;
+
+  if (Args == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Call blocked version
+  int Rc = OFFLOAD_SUCCESS;
+  if (Args->IsRectMemcpy) {
+    Rc = omp_target_memcpy_rect(
+        Args->Dst, Args->Src, Args->ElementSize, Args->NumDims, Args->Volume,
+        Args->DstOffsets, Args->SrcOffsets, Args->DstDimensions,
+        Args->SrcDimensions, Args->DstDevice, Args->SrcDevice);
+
+    DP("omp_target_memcpy_rect returns %d\n", Rc);
+  } else {
+    Rc = omp_target_memcpy(Args->Dst, Args->Src, Args->Length, Args->DstOffset,
+                           Args->SrcOffset, Args->DstDevice, Args->SrcDevice);
+
+    DP("omp_target_memcpy returns %d\n", Rc);
+  }
+
+  // Release the arguments object
+  delete Args;
+
+  return Rc;
+}
+
+// Allocate and launch helper task
+static int libomp_helper_task_creation(TargetMemcpyArgsTy *Args,
+                                       int DepObjCount,
+                                       omp_depend_t *DepObjList) {
+  // Create global thread ID
+  int Gtid = __kmpc_global_thread_num(nullptr);
+  int (*Fn)(kmp_int32, kmp_task_t *) = &libomp_target_memcpy_async_helper;
+
+  // Setup the hidden helper flags;
+  kmp_int32 Flags = 0;
+  kmp_tasking_flags_t *InputFlags = (kmp_tasking_flags_t *)&Flags;
+  InputFlags->hidden_helper = 1;
+
+  // Alloc helper task
+  kmp_task_t *Ptr = __kmpc_omp_target_task_alloc(nullptr, Gtid, Flags,
+                                                 sizeof(kmp_task_t), 0, Fn, -1);
+
+  if (Ptr == nullptr) {
+    // Task allocation failed, delete the argument object
+    delete Args;
+
+    return OFFLOAD_FAIL;
+  }
+
+  // Setup the arguments passed to helper task
+  Ptr->shareds = Args;
+
+  // Convert the type of depend objects
+  llvm::SmallVector<kmp_depend_info_t> DepObjs;
+  for (int i = 0; i < DepObjCount; i++) {
+    omp_depend_t DepObj = DepObjList[i];
+    DepObjs.push_back(*((kmp_depend_info_t *)DepObj));
+  }
+
+  // Launch the helper task
+  int Rc = __kmpc_omp_task_with_deps(nullptr, Gtid, Ptr, DepObjCount,
+                                     DepObjs.data(), 0, nullptr);
+
+  return Rc;
+}
+
+EXTERN int omp_target_memcpy_async(void *Dst, const void *Src, size_t Length,
+                                   size_t DstOffset, size_t SrcOffset,
+                                   int DstDevice, int SrcDevice,
+                                   int DepObjCount, omp_depend_t *DepObjList) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
+     "src offset %zu, length %zu\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DstOffset, SrcOffset,
+     Length);
+
+  // Check the source and dest address
+  if (Dst == nullptr || Src == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Create task object
+  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+      Dst, Src, Length, DstOffset, SrcOffset, DstDevice, SrcDevice);
+
+  // Create and launch helper task
+  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+
+  DP("omp_target_memcpy_async returns %d\n", Rc);
+  return Rc;
+}
+
 EXTERN int
 omp_target_memcpy_rect(void *Dst, const void *Src, size_t ElementSize,
                        int NumDims, const size_t *Volume,
@@ -267,6 +368,43 @@
   return Rc;
 }
 
+EXTERN int omp_target_memcpy_rect_async(
+    void *Dst, const void *Src, size_t ElementSize, int NumDims,
+    const size_t *Volume, const size_t *DstOffsets, const size_t *SrcOffsets,
+    const size_t *DstDimensions, const size_t *SrcDimensions, int DstDevice,
+    int SrcDevice, int DepObjCount, omp_depend_t *DepObjList) {
+  TIMESCOPE();
+  DP("Call to omp_target_memcpy_rect_async, dst device %d, src device %d, "
+     "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
+     "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
+     "volume " DPxMOD ", element size %zu, num_dims %d\n",
+     DstDevice, SrcDevice, DPxPTR(Dst), DPxPTR(Src), DPxPTR(DstOffsets),
+     DPxPTR(SrcOffsets), DPxPTR(DstDimensions), DPxPTR(SrcDimensions),
+     DPxPTR(Volume), ElementSize, NumDims);
+
+  // Need to check this first to not return OFFLOAD_FAIL instead
+  if (!Dst && !Src) {
+    DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
+       INT_MAX);
+    return INT_MAX;
+  }
+
+  // Check the source and dest address
+  if (Dst == nullptr || Src == nullptr)
+    return OFFLOAD_FAIL;
+
+  // Create task object
+  TargetMemcpyArgsTy *Args = new TargetMemcpyArgsTy(
+      Dst, Src, ElementSize, NumDims, Volume, DstOffsets, SrcOffsets,
+      DstDimensions, SrcDimensions, DstDevice, SrcDevice);
+
+  // Create and launch helper task
+  int Rc = libomp_helper_task_creation(Args, DepObjCount, DepObjList);
+
+  DP("omp_target_memcpy_rect_async returns %d\n", Rc);
+  return Rc;
+}
+
 EXTERN int omp_target_associate_ptr(const void *HostPtr, const void *DevicePtr,
                                     size_t Size, size_t DeviceOffset,
                                     int DeviceNum) {
diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports
--- a/openmp/libomptarget/src/exports
+++ b/openmp/libomptarget/src/exports
@@ -41,6 +41,8 @@
     omp_target_is_present;
     omp_target_memcpy;
     omp_target_memcpy_rect;
+    omp_target_memcpy_async;
+    omp_target_memcpy_rect_async;
     omp_target_associate_ptr;
     omp_target_disassociate_ptr;
     llvm_omp_target_alloc_host;
diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h
--- a/openmp/libomptarget/src/private.h
+++ b/openmp/libomptarget/src/private.h
@@ -104,7 +104,51 @@
  * We maintain the same data structure for compatibility.
  */
 typedef int kmp_int32;
+typedef int64_t kmp_int64;
 typedef intptr_t kmp_intptr_t;
+
+typedef void *omp_depend_t;
+struct kmp_task;
+typedef kmp_int32 (*kmp_routine_entry_t)(kmp_int32, struct kmp_task *);
+typedef struct kmp_task {
+  void *shareds;
+  kmp_routine_entry_t routine;
+  kmp_int32 part_id;
+} kmp_task_t;
+
+typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */
+  /* Compiler flags */             /* Total compiler flags must be 16 bits */
+  unsigned tiedness : 1;           /* task is either tied (1) or untied (0) */
+  unsigned final : 1;              /* task is final(1) so execute immediately */
+  unsigned merged_if0 : 1; /* no __kmpc_task_{begin/complete}_if0 calls in if0
+                              code path */
+  unsigned destructors_thunk : 1; /* set if the compiler creates a thunk to
+                                     invoke destructors from the runtime */
+  unsigned proxy : 1; /* task is a proxy task (it will be executed outside the
+                         context of the RTL) */
+  unsigned priority_specified : 1; /* set if the compiler provides priority
+                                      setting for the task */
+  unsigned detachable : 1;         /* 1 == can detach */
+  unsigned hidden_helper : 1;      /* 1 == hidden helper task */
+  unsigned reserved : 8;           /* reserved for compiler use */
+
+  /* Library flags */       /* Total library flags must be 16 bits */
+  unsigned tasktype : 1;    /* task is either explicit(1) or implicit (0) */
+  unsigned task_serial : 1; // task is executed immediately (1) or deferred (0)
+  unsigned tasking_ser : 1; // all tasks in team are either executed immediately
+  // (1) or may be deferred (0)
+  unsigned team_serial : 1; // entire team is serial (1) [1 thread] or parallel
+  // (0) [>= 2 threads]
+  /* If either team_serial or tasking_ser is set, task team may be NULL */
+  /* Task State Flags: */
+  unsigned started : 1;    /* 1==started, 0==not started     */
+  unsigned executing : 1;  /* 1==executing, 0==not executing */
+  unsigned complete : 1;   /* 1==complete, 0==not complete   */
+  unsigned freed : 1;      /* 1==freed, 0==allocated        */
+  unsigned native : 1;     /* 1==gcc-compiled task, 0==intel */
+  unsigned reserved31 : 7; /* reserved for library use */
+} kmp_tasking_flags_t;
+
 // Compiler sends us this info:
 typedef struct kmp_depend_info {
   kmp_intptr_t base_addr;
@@ -126,6 +170,86 @@
 void **__kmpc_omp_get_target_async_handle_ptr(kmp_int32 gtid)
     __attribute__((weak));
 bool __kmpc_omp_has_task_team(kmp_int32 gtid) __attribute__((weak));
+kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
+                                  kmp_int32 flags, size_t sizeof_kmp_task_t,
+                                  size_t sizeof_shareds,
+                                  kmp_routine_entry_t task_entry)
+    __attribute__((weak));
+
+kmp_task_t *
+__kmpc_omp_target_task_alloc(ident_t *loc_ref, kmp_int32 gtid, kmp_int32 flags,
+                             size_t sizeof_kmp_task_t, size_t sizeof_shareds,
+                             kmp_routine_entry_t task_entry,
+                             kmp_int64 device_id) __attribute__((weak));
+
+kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid,
+                                    kmp_task_t *new_task, kmp_int32 ndeps,
+                                    kmp_depend_info_t *dep_list,
+                                    kmp_int32 ndeps_noalias,
+                                    kmp_depend_info_t *noalias_dep_list)
+    __attribute__((weak));
+
+/**
+ * The argument set that is passed from asynchronous memory copy to block
+ * version of memory copy invoked in helper task
+ */
+struct TargetMemcpyArgsTy {
+  /**
+   * Common attribuutes
+   */
+  void *Dst;
+  const void *Src;
+  int DstDevice;
+  int SrcDevice;
+
+  /**
+   * The flag that denotes single dimensional or rectangle dimensional copy
+   */
+  bool IsRectMemcpy;
+
+  /**
+   * Arguments for single dimensional copy
+   */
+  size_t Length;
+  size_t DstOffset;
+  size_t SrcOffset;
+
+  /**
+   * Arguments for rectangle dimensional copy
+   */
+  size_t ElementSize;
+  int NumDims;
+  const size_t *Volume;
+  const size_t *DstOffsets;
+  const size_t *SrcOffsets;
+  const size_t *DstDimensions;
+  const size_t *SrcDimensions;
+
+  /**
+   * Constructor for single dimensional copy
+   */
+  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t Length,
+                     size_t DstOffset, size_t SrcOffset, int DstDevice,
+                     int SrcDevice)
+      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+        IsRectMemcpy(false), Length(Length), DstOffset(DstOffset),
+        SrcOffset(SrcOffset), ElementSize(0), NumDims(0), Volume(0),
+        DstOffsets(0), SrcOffsets(0), DstDimensions(0), SrcDimensions(0){};
+
+  /**
+   * Constructor for rectangle dimensional copy
+   */
+  TargetMemcpyArgsTy(void *Dst, const void *Src, size_t ElementSize,
+                     int NumDims, const size_t *Volume,
+                     const size_t *DstOffsets, const size_t *SrcOffsets,
+                     const size_t *DstDimensions, const size_t *SrcDimensions,
+                     int DstDevice, int SrcDevice)
+      : Dst(Dst), Src(Src), DstDevice(DstDevice), SrcDevice(SrcDevice),
+        IsRectMemcpy(true), Length(0), DstOffset(0), SrcOffset(0),
+        ElementSize(ElementSize), NumDims(NumDims), Volume(Volume),
+        DstOffsets(DstOffsets), SrcOffsets(SrcOffsets),
+        DstDimensions(DstDimensions), SrcDimensions(SrcDimensions){};
+};
 // Invalid GTID as defined by libomp; keep in sync
 #define KMP_GTID_DNE (-2)
 #ifdef __cplusplus
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_async1.c b/openmp/libomptarget/test/api/omp_target_memcpy_async1.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_async1.c
@@ -0,0 +1,48 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+// Test case for omp_target_memcpy_async, oringally from GCC
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  int q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  p = omp_target_alloc(130 * sizeof(int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = i;
+
+  if (omp_target_memcpy_async(p, q, 128 * sizeof(int), sizeof(int), 0, d, id, 0,
+                              NULL)) {
+    abort();
+  }
+
+#pragma omp taskwait
+
+  int q2[128];
+  for (i = 0; i < 128; ++i)
+    q2[i] = 0;
+  if (omp_target_memcpy_async(q2, p, 128 * sizeof(int), 0, sizeof(int), id, d,
+                              0, NULL))
+    abort();
+
+#pragma omp taskwait
+
+  for (i = 0; i < 128; ++i)
+    if (q2[i] != q[i])
+      abort();
+
+  omp_target_free(p, d);
+
+  return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_async2.c b/openmp/libomptarget/test/api/omp_target_memcpy_async2.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_async2.c
@@ -0,0 +1,73 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include "stdio.h"
+#include <omp.h>
+#include <stdlib.h>
+
+int main() {
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  int a[128], b[64], c[32], e[16], q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  p = omp_target_alloc(130 * sizeof(int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; ++i)
+    a[i] = i + 1;
+  for (i = 0; i < 64; ++i)
+    b[i] = i + 2;
+  for (i = 0; i < 32; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = i + 4;
+
+  omp_depend_t obj[2];
+
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend(out : p)
+    omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(inout : p)
+    omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(out : c)
+    for (i = 0; i < 32; i++)
+      c[i] = i + 3;
+
+#pragma omp depobj(obj[0]) depend(inout : p)
+#pragma omp depobj(obj[1]) depend(in : c)
+    omp_target_memcpy_async(p, c, 32 * sizeof(int), 0, 0, d, id, 2, obj);
+
+#pragma omp task depend(in : p)
+    omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
+  }
+
+#pragma omp taskwait
+
+  for (i = 0; i < 128; ++i)
+    q[i] = 0;
+  omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d);
+  for (i = 0; i < 16; ++i)
+    if (q[i] != i + 4)
+      abort();
+  for (i = 16; i < 32; ++i)
+    if (q[i] != i + 3)
+      abort();
+  for (i = 32; i < 64; ++i)
+    if (q[i] != i + 2)
+      abort();
+  for (i = 64; i < 128; ++i)
+    if (q[i] != i + 1)
+      abort();
+
+  omp_target_free(p, d);
+
+  return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async1.c
@@ -0,0 +1,66 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int main() {
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  int q[128], q2[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  p = omp_target_alloc(130 * sizeof(int), d);
+  if (p == NULL)
+    return 0;
+
+  if (omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+                                   NULL, d, id, 0, NULL) < 3 ||
+      omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+                                   NULL, id, d, 0, NULL) < 3 ||
+      omp_target_memcpy_rect_async(NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+                                   NULL, id, id, 0, NULL) < 3)
+    abort();
+
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
+    abort();
+
+  for (i = 0; i < 128; i++)
+    q[i] = i + 1;
+
+  size_t volume[NUM_DIMS] = {1, 2, 3};
+  size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
+  size_t src_offsets[NUM_DIMS] = {0, 0, 0};
+  size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
+  size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
+
+  if (omp_target_memcpy_rect_async(p, q, sizeof(int), NUM_DIMS, volume,
+                                   dst_offsets, src_offsets, dst_dimensions,
+                                   src_dimensions, d, id, 0, NULL) != 0)
+    abort();
+
+#pragma omp taskwait
+
+  for (i = 0; i < 128; i++)
+    q2[i] = 0;
+  if (omp_target_memcpy(q2, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+    abort();
+
+  /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0  */
+  if (q2[0] != 1 || q2[1] != 2 || q2[2] != 3 || q2[3] != 0 || q2[4] != 0 ||
+      q2[5] != 5 || q2[6] != 6 || q2[7] != 7)
+    abort();
+  for (i = 8; i < 128; ++i)
+    if (q2[i] != 0)
+      abort();
+
+  omp_target_free(p, d);
+  return 0;
+}
diff --git a/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/test/api/omp_target_memcpy_rect_async2.c
@@ -0,0 +1,89 @@
+// RUN: %libomptarget-compile-and-run-generic
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int main() {
+  int d = omp_get_default_device();
+  int id = omp_get_initial_device();
+  int a[128], b[64], c[128], e[16], q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices())
+    d = id;
+
+  p = omp_target_alloc(130 * sizeof(int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy(p, q, 128 * sizeof(int), 0, 0, d, id) != 0)
+    abort();
+
+  size_t volume[NUM_DIMS] = {2, 2, 3};
+  size_t dst_offsets[NUM_DIMS] = {0, 0, 0};
+  size_t src_offsets[NUM_DIMS] = {0, 0, 0};
+  size_t dst_dimensions[NUM_DIMS] = {3, 4, 5};
+  size_t src_dimensions[NUM_DIMS] = {2, 3, 4};
+
+  for (i = 0; i < 128; i++)
+    a[i] = 42;
+  for (i = 0; i < 64; i++)
+    b[i] = 24;
+  for (i = 0; i < 128; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = 77;
+
+  omp_depend_t obj[2];
+
+#pragma omp parallel num_threads(5)
+#pragma omp single
+  {
+#pragma omp task depend(out : p)
+    omp_target_memcpy(p, a, 128 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(inout : p)
+    omp_target_memcpy(p, b, 64 * sizeof(int), 0, 0, d, id);
+
+#pragma omp task depend(out : c)
+    for (i = 0; i < 128; i++)
+      c[i] = i + 1;
+
+#pragma omp depobj(obj[0]) depend(inout : p)
+#pragma omp depobj(obj[1]) depend(in : c)
+
+    /*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+        13 14 15 - - 17 18 19 - - at positions 20..29.  */
+    omp_target_memcpy_rect_async(p, c, sizeof(int), NUM_DIMS, volume,
+                                 dst_offsets, src_offsets, dst_dimensions,
+                                 src_dimensions, d, id, 2, obj);
+
+#pragma omp task depend(in : p)
+    omp_target_memcpy(p, e, 16 * sizeof(int), 0, 0, d, id);
+  }
+
+#pragma omp taskwait
+
+  if (omp_target_memcpy(q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+    abort();
+
+  for (i = 0; i < 16; ++i)
+    if (q[i] != 77)
+      abort();
+  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18 ||
+      q[27] != 19)
+    abort();
+  for (i = 28; i < 64; ++i)
+    if (q[i] != 24)
+      abort();
+  for (i = 64; i < 128; ++i)
+    if (q[i] != 42)
+      abort();
+
+  omp_target_free(p, d);
+  return 0;
+}