diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1167,6 +1167,39 @@
   let hasCanonicalizer = 1;
 }
 
+def GPU_AllocManagedOp : GPU_Op<"alloc_managed", []> {
+
+  let summary = "GPU managed memory allocation operation";
+  let description = [{
+    The `gpu.alloc_managed` operation allocates a region of memory that is
+    visible to both the GPU and the CPU. It is similar to the `memref.alloc`
+    op, but the allocated data is automatically migrated between the memory
+    spaces by the driver, via page faults or in software.
+
+    If the `attachHost` attribute is set, the memory cannot be accessed by
+    any stream on any device.
+
+    Example:
+
+    ```mlir
+    %memref = gpu.alloc_managed (%width) : memref<64x?xf32, 1>
+    ```
+  }];
+
+  let arguments = (ins Variadic<Index>:$dynamicSizes,
+                       OptionalAttr<UnitAttr>:$attachHost);
+  let results = (outs Res<AnyMemRef, "", [MemAlloc]>:$memref);
+
+  let extraClassDeclaration = [{
+    MemRefType getType() { return ::llvm::cast<MemRefType>(getMemref().getType()); }
+  }];
+
+  let assemblyFormat = [{
+    `(` $dynamicSizes `)` attr-dict `:` type($memref)
+  }];
+  let hasCanonicalizer = 1;
+  let hasVerifier = 1;
+}
+
 def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> {
 
   let summary = "GPU memory deallocation operation";
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -177,6 +177,11 @@
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
        llvmPointerType /* void *stream */}};
+  FunctionCallBuilder allocManagedCallBuilder = {
+      "mgpuMemAllocManaged",
+      llvmPointerType /* void * */,
+      {llvmIntPtrType /* intptr_t sizeBytes */,
+       llvmInt32Type /* unsigned flags */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -349,6 +354,20 @@
       ConversionPatternRewriter &rewriter) const override;
 };
 
+/// A rewrite pattern to convert gpu.alloc_managed operations into a GPU
+/// runtime call. Currently it supports CUDA and ROCm (HIP).
+class ConvertAllocManagedOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocManagedOp> {
+public:
+  ConvertAllocManagedOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocManagedOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::AllocManagedOp allocOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 /// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
 /// call. Currently it supports CUDA and ROCm (HIP).
 class ConvertDeallocOpToGpuRuntimeCallPattern
@@ -906,6 +925,49 @@
   return success();
 }
 
+LogicalResult ConvertAllocManagedOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::AllocManagedOp allocOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  MemRefType memRefType = allocOp.getType();
+
+  if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
+      !isConvertibleAndHasIdentityMaps(memRefType))
+    return failure();
+
+  auto loc = allocOp.getLoc();
+
+  // Get shape of the memref as values: static sizes are constant
+  // values and dynamic sizes are passed to 'alloc' as operands.
+  SmallVector<Value, 4> shape;
+  SmallVector<Value, 4> strides;
+  Value sizeBytes;
+  getMemRefDescriptorSizes(loc, memRefType, adaptor.getDynamicSizes(),
+                           rewriter, shape, strides, sizeBytes);
+
+  // Allocate the underlying buffer and store a pointer to it in the MemRef
+  // descriptor.
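+  // The flags value is forwarded to the mgpuMemAllocManaged wrapper: 1
+  // corresponds to CU_MEM_ATTACH_GLOBAL / hipMemAttachGlobal and 2 to
+  // CU_MEM_ATTACH_HOST / hipMemAttachHost.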
+  Type elementPtrType = this->getElementPtrType(memRefType);
+  Value flags = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmInt32Type,
+      rewriter.getI32IntegerAttr(allocOp.getAttachHost() ? 2 : 1));
+  Value allocatedPtr =
+      allocManagedCallBuilder.create(loc, rewriter, {sizeBytes, flags})
+          .getResult();
+  if (!getTypeConverter()->useOpaquePointers())
+    allocatedPtr =
+        rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
+
+  // No alignment.
+  Value alignedPtr = allocatedPtr;
+
+  // Create the MemRef descriptor.
+  auto memRefDescriptor = this->createMemRefDescriptor(
+      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);
+
+  rewriter.replaceOp(allocOp, {memRefDescriptor});
+
+  return success();
+}
+
 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DeallocOp deallocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1786,6 +1848,7 @@
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
 
   patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
+               ConvertAllocManagedOpToGpuRuntimeCallPattern,
                ConvertDeallocOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1744,6 +1744,25 @@
   return success();
 }
 
+LogicalResult AllocManagedOp::verify() {
+  auto memRefType = llvm::cast<MemRefType>(getMemref().getType());
+
+  if (static_cast<int64_t>(getDynamicSizes().size()) !=
+      memRefType.getNumDynamicDims())
+    return emitOpError("dimension operand count does not equal memref "
+                       "dynamic dimension count");
+
+  // The op takes no layout symbol operands, so the memref layout must not
+  // require any symbols.
+  unsigned numSymbols = 0;
+  if (!memRefType.getLayout().isIdentity())
+    numSymbols = memRefType.getLayout().getAffineMap().getNumSymbols();
+  if (numSymbols != 0)
+    return emitOpError("memref layouts with symbols are not supported");
+
+  return success();
+}
+
 namespace {
 
 /// Folding of memref.dim(gpu.alloc(%size), %idx) -> %size similar to
@@ -1765,6 +1784,11 @@
   results.add<SimplifyDimOfAllocOp>(context);
 }
 
+void AllocManagedOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                 MLIRContext *context) {
+  results.add<SimplifyDimOfAllocOp>(context);
+}
+
 #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
 #include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc"
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -175,6 +175,13 @@
   return reinterpret_cast<void *>(ptr);
 }
 
+extern "C" void *mgpuMemAllocManaged(uint64_t sizeBytes, unsigned int flags) {
+  ScopedContext scopedContext;
+  CUdeviceptr sharedPtr;
+  CUDA_REPORT_IF_ERROR(cuMemAllocManaged(&sharedPtr, sizeBytes, flags));
+  return reinterpret_cast<void *>(sharedPtr);
+}
+
 extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
   CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
 }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -105,6 +105,12 @@
   return ptr;
 }
 
+extern "C" void *mgpuMemAllocManaged(uint64_t sizeBytes, unsigned int flags) {
+  void *sharedPtr;
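+  // The hipMemAttachGlobal (0x1) and hipMemAttachHost (0x2) flag values match
+  // the corresponding CUDA CU_MEM_ATTACH_* flags, so the incoming flags can
+  // be passed through unchanged.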
+  HIP_REPORT_IF_ERROR(hipMallocManaged(&sharedPtr, sizeBytes, flags));
+  return sharedPtr;
+}
+
 extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) {
   HIP_REPORT_IF_ERROR(hipFree(ptr));
 }
diff --git a/mlir/test/Integration/GPU/CUDA/managed.mlir b/mlir/test/Integration/GPU/CUDA/managed.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/managed.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
+// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
+// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_async_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void -O0 \
+// RUN: | FileCheck %s
+
+// CHECK: [42, 412]
+// CHECK: Hello from GPU data[0]=42
+// CHECK: Hello from GPU data[1]=412
+// CHECK: [42, 454]
+
+func.func @main() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %count = arith.constant 2 : index
+
+  // Initialize the managed buffer on the host.
+  %sharedPtr = gpu.alloc_managed(%count) : memref<?xi32>
+  %h0_unranked = memref.cast %sharedPtr : memref<?xi32> to memref<*xi32>
+
+  %v0 = arith.constant 42 : i32
+  %v1 = arith.constant 412 : i32
+  memref.store %v0, %sharedPtr[%c0] : memref<?xi32>
+  memref.store %v1, %sharedPtr[%c1] : memref<?xi32>
+
+  call @printMemrefI32(%h0_unranked) : (memref<*xi32>) -> ()
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) {
+    %v2 = memref.load %sharedPtr[%c0] : memref<?xi32>
+    %v3 = memref.load %sharedPtr[%c1] : memref<?xi32>
+    %sum = arith.addi %v2, %v3 : i32
+    gpu.printf "Hello from GPU data[%lld]=%d \n" %c0, %v2 : index, i32
+    gpu.printf "Hello from GPU data[%lld]=%d \n" %c1, %v3 : index, i32
+    memref.store %sum, %sharedPtr[%c1] : memref<?xi32>
+    gpu.terminator
+  }
+
+  call @printMemrefI32(%h0_unranked) : (memref<*xi32>) -> ()
+  return
+}
+func.func private @printMemrefI32(memref<*xi32>)
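
A minimal usage sketch of the host-attached form (not part of the patch; it assumes the unit attribute keeps its default `attachHost` spelling in the attribute dictionary):

```mlir
func.func @alloc_attach_host(%n: index) -> memref<?xf32> {
  // With the attribute present the lowering emits mgpuMemAllocManaged(size, 2),
  // i.e. CU_MEM_ATTACH_HOST / hipMemAttachHost; without it the flag is 1
  // (CU_MEM_ATTACH_GLOBAL / hipMemAttachGlobal).
  %m = gpu.alloc_managed(%n) {attachHost} : memref<?xf32>
  return %m : memref<?xf32>
}
```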