diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -11,11 +11,19 @@
 #include <memory>
 
 namespace mlir {
+class LLVMTypeConverter;
+class OwningRewritePatternList;
+
+template <typename OpT>
+class OperationPass;
 
 namespace gpu {
 class GPUModuleOp;
 } // namespace gpu
-template <typename OpT> class OperationPass;
+
+/// Populates `patterns` with GPU-to-ROCDL lowerings that use `converter`.
+void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
+                                          OwningRewritePatternList &patterns);
 
 /// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -87,5 +87,19 @@
 def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
                                                "__ockl_get_global_size", 2>;
 
+//===----------------------------------------------------------------------===//
+// Synchronization primitives
+
+def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
+  string llvmBuilder = [{ // Emits: release fence; s_barrier; acquire fence.
+    llvm::LLVMContext &llvmContext = builder.getContext();
+    builder.CreateFence(llvm::AtomicOrdering::Release,
+                        llvmContext.getOrInsertSyncScopeID("workgroup"));
+    createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier);
+    builder.CreateFence(llvm::AtomicOrdering::Acquire,
+                        llvmContext.getOrInsertSyncScopeID("workgroup"));
+  }];
+  let assemblyFormat = "attr-dict";
+}
 
 #endif // ROCDLIR_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -0,0 +1,182 @@
+//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
+#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
+
+#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/Support/FormatVariadic.h"
+
+namespace mlir {
+
+/// Lowers a `gpu::GPUFuncOp` to an `LLVM::LLVMFuncOp`.
+///
+/// Each workgroup memory attribution becomes an internal `LLVM::GlobalOp`
+/// array in the GPU dialect's workgroup address space. Each private memory
+/// attribution becomes an `LLVM::AllocaOp` whose result pointer is in address
+/// space `AllocaAddrSpace`. Both are wrapped in static-shape memref
+/// descriptors that replace the corresponding entry block arguments.
+template <unsigned AllocaAddrSpace>
+struct GPUFuncOpLowering : ConvertToLLVMPattern {
+  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
+      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
+                             typeConverter.getDialect()->getContext(),
+                             typeConverter) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    assert(operands.empty() && "func op is not expected to have operands");
+    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
+    Location loc = gpuFuncOp.getLoc();
+
+    // Create one global buffer per workgroup memory attribution, named after
+    // the function and the attribution's position.
+    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
+    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+      Value attribution = en.value();
+
+      auto type = attribution.getType().dyn_cast<MemRefType>();
+      assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+      uint64_t numElements = type.getNumElements();
+
+      auto elementType = typeConverter.convertType(type.getElementType())
+                             .template cast<LLVM::LLVMType>();
+      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
+      std::string name = std::string(
+          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
+      auto globalOp = rewriter.create<LLVM::GlobalOp>(
+          loc, arrayType, /*isConstant=*/false, LLVM::Linkage::Internal, name,
+          /*value=*/Attribute(), gpu::GPUDialect::getWorkgroupAddressSpace());
+      workgroupBuffers.push_back(globalOp);
+    }
+
+    // Rewrite the original GPU function to an LLVM function.
+    auto funcType = typeConverter.convertType(gpuFuncOp.getType())
+                        .template cast<LLVM::LLVMType>()
+                        .getPointerElementTy();
+
+    // Remap proper input types.
+    TypeConverter::SignatureConversion signatureConversion(
+        gpuFuncOp.front().getNumArguments());
+    typeConverter.convertFunctionSignature(
+        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
+
+    // Create the new function operation. Only copy those attributes that are
+    // not specific to function modeling.
+    SmallVector<NamedAttribute, 4> attributes;
+    for (const auto &attr : gpuFuncOp.getAttrs()) {
+      if (attr.first == SymbolTable::getSymbolAttrName() ||
+          attr.first == impl::getTypeAttrName() ||
+          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
+        continue;
+      attributes.push_back(attr);
+    }
+    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
+        loc, gpuFuncOp.getName(), funcType, LLVM::Linkage::External,
+        attributes);
+
+    {
+      // Insert operations that correspond to converted workgroup and private
+      // memory attributions to the body of the function. This must operate on
+      // the original function, before the body region is inlined in the new
+      // function to maintain the relation between block arguments and the
+      // parent operation that assigns their semantics.
+      OpBuilder::InsertionGuard guard(rewriter);
+
+      // Rewrite workgroup memory attributions to addresses of global buffers.
+      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
+      unsigned numProperArguments = gpuFuncOp.getNumArguments();
+      auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
+
+      // The zero index is only needed (and only created) when there is at
+      // least one workgroup buffer to address.
+      Value zero = nullptr;
+      if (!workgroupBuffers.empty())
+        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
+                                                 rewriter.getI32IntegerAttr(0));
+      for (auto en : llvm::enumerate(workgroupBuffers)) {
+        LLVM::GlobalOp global = en.value();
+        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
+        auto elementType = global.getType().getArrayElementType();
+        Value memory = rewriter.create<LLVM::GEPOp>(
+            loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
+            address, ArrayRef<Value>{zero, zero});
+
+        // Build a memref descriptor pointing to the buffer to plug with the
+        // existing memref infrastructure. This may use more registers than
+        // otherwise necessary given that memref sizes are fixed, but we can try
+        // and canonicalize that away later.
+        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
+        auto type = attribution.getType().cast<MemRefType>();
+        auto descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, typeConverter, type, memory);
+        signatureConversion.remapInput(numProperArguments + en.index(), descr);
+      }
+
+      // Rewrite private memory attributions to alloca'ed buffers.
+      unsigned numWorkgroupAttributions =
+          gpuFuncOp.getNumWorkgroupAttributions();
+      auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
+      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
+        Value attribution = en.value();
+        auto type = attribution.getType().dyn_cast<MemRefType>();
+        assert(type && type.hasStaticShape() &&
+               "unexpected type in attribution");
+
+        // The alloca's result pointer is placed in `AllocaAddrSpace`, which is
+        // chosen by the target instantiating this pattern.
+        auto ptrType = typeConverter.convertType(type.getElementType())
+                           .template cast<LLVM::LLVMType>()
+                           .getPointerTo(AllocaAddrSpace);
+        Value numElements = rewriter.create<LLVM::ConstantOp>(
+            loc, int64Ty, rewriter.getI64IntegerAttr(type.getNumElements()));
+        Value allocated = rewriter.create<LLVM::AllocaOp>(
+            loc, ptrType, numElements, /*alignment=*/0);
+        auto descr = MemRefDescriptor::fromStaticShape(
+            rewriter, loc, typeConverter, type, allocated);
+        signatureConversion.remapInput(
+            numProperArguments + numWorkgroupAttributions + en.index(), descr);
+      }
+    }
+
+    // Move the region to the new function, update the entry block signature.
+    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
+                                llvmFuncOp.end());
+    rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
+                                      signatureConversion);
+
+    rewriter.eraseOp(gpuFuncOp);
+    return success();
+  }
+};
+
+/// Lowers a `gpu::ReturnOp` to an `LLVM::ReturnOp`, forwarding the converted
+/// operands unchanged.
+struct GPUReturnOpLowering : public ConvertToLLVMPattern {
+  explicit GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
+      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
+                             typeConverter.getDialect()->getContext(),
+                             typeConverter) {}
+
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
+    return success();
+  }
+};
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Transforms/DialectConversion.h"
 #include "llvm/Support/FormatVariadic.h"
 
+#include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
 #include "../GPUCommon/OpToFuncCallLowering.h"
 #include "../PassDetail.h"
@@ -88,155 +89,6 @@
   }
 };
 
-struct GPUFuncOpLowering : ConvertToLLVMPattern {
-  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
-      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
-                             typeConverter.getDialect()->getContext(),
-                             typeConverter) {}
-
-  LogicalResult
-  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    assert(operands.empty() && "func op is not expected to have operands");
-    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
-    Location loc = gpuFuncOp.getLoc();
-
-    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
-    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
-      Value attribution = en.value();
-
-      auto type = attribution.getType().dyn_cast<MemRefType>();
-      assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
-      uint64_t numElements = type.getNumElements();
-
-      auto elementType = typeConverter.convertType(type.getElementType())
-                             .cast<LLVM::LLVMType>();
-      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
-      std::string name = std::string(
-          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
-      auto globalOp = rewriter.create<LLVM::GlobalOp>(
-          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
-          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
-          gpu::GPUDialect::getWorkgroupAddressSpace());
-      workgroupBuffers.push_back(globalOp);
-    }
-
-    // Rewrite the original GPU function to an LLVM function.
-    auto funcType = typeConverter.convertType(gpuFuncOp.getType())
-                        .cast<LLVM::LLVMType>()
-                        .getPointerElementTy();
-
-    // Remap proper input types.
-    TypeConverter::SignatureConversion signatureConversion(
-        gpuFuncOp.front().getNumArguments());
-    typeConverter.convertFunctionSignature(
-        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
-
-    // Create the new function operation. Only copy those attributes that are
-    // not specific to function modeling.
-    SmallVector<NamedAttribute, 4> attributes;
-    for (const auto &attr : gpuFuncOp.getAttrs()) {
-      if (attr.first == SymbolTable::getSymbolAttrName() ||
-          attr.first == impl::getTypeAttrName() ||
-          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
-        continue;
-      attributes.push_back(attr);
-    }
-    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
-        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
-        LLVM::Linkage::External, attributes);
-
-    {
-      // Insert operations that correspond to converted workgroup and private
-      // memory attributions to the body of the function. This must operate on
-      // the original function, before the body region is inlined in the new
-      // function to maintain the relation between block arguments and the
-      // parent operation that assigns their semantics.
-      OpBuilder::InsertionGuard guard(rewriter);
-
-      // Rewrite workgroup memory attributions to addresses of global buffers.
-      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
-      unsigned numProperArguments = gpuFuncOp.getNumArguments();
-      auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
-
-      Value zero = nullptr;
-      if (!workgroupBuffers.empty())
-        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
-                                                 rewriter.getI32IntegerAttr(0));
-      for (auto en : llvm::enumerate(workgroupBuffers)) {
-        LLVM::GlobalOp global = en.value();
-        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
-        auto elementType = global.getType().getArrayElementType();
-        Value memory = rewriter.create<LLVM::GEPOp>(
-            loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
-            address, ArrayRef<Value>{zero, zero});
-
-        // Build a memref descriptor pointing to the buffer to plug with the
-        // existing memref infrastructure. This may use more registers than
-        // otherwise necessary given that memref sizes are fixed, but we can try
-        // and canonicalize that away later.
-        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
-        auto type = attribution.getType().cast<MemRefType>();
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, typeConverter, type, memory);
-        signatureConversion.remapInput(numProperArguments + en.index(), descr);
-      }
-
-      // Rewrite private memory attributions to alloca'ed buffers.
-      unsigned numWorkgroupAttributions =
-          gpuFuncOp.getNumWorkgroupAttributions();
-      auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
-      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
-        Value attribution = en.value();
-        auto type = attribution.getType().cast<MemRefType>();
-        assert(type && type.hasStaticShape() &&
-               "unexpected type in attribution");
-
-        // Explicitly drop memory space when lowering private memory
-        // attributions since NVVM models it as `alloca`s in the default
-        // memory space and does not support `alloca`s with addrspace(5).
-        auto ptrType = typeConverter.convertType(type.getElementType())
-                           .cast<LLVM::LLVMType>()
-                           .getPointerTo();
-        Value numElements = rewriter.create<LLVM::ConstantOp>(
-            gpuFuncOp.getLoc(), int64Ty,
-            rewriter.getI64IntegerAttr(type.getNumElements()));
-        Value allocated = rewriter.create<LLVM::AllocaOp>(
-            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, typeConverter, type, allocated);
-        signatureConversion.remapInput(
-            numProperArguments + numWorkgroupAttributions + en.index(), descr);
-      }
-    }
-
-    // Move the region to the new function, update the entry block signature.
-    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
-                                llvmFuncOp.end());
-    rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
-                                      signatureConversion);
-
-    rewriter.eraseOp(gpuFuncOp);
-    return success();
-  }
-};
-
-struct GPUReturnOpLowering : public ConvertToLLVMPattern {
-  GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
-      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
-                             typeConverter.getDialect()->getContext(),
-                             typeConverter) {}
-
-  LogicalResult
-  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
-    return success();
-  }
-};
-
 /// Import the GPU Ops to NVVM Patterns.
 #include "GPUToNVVM.cpp.inc"
 
@@ -300,8 +152,11 @@
                                           NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
               GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                           NVVM::GridDimYOp, NVVM::GridDimZOp>,
-              GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
-          converter);
+              GPUShuffleOpLowering, GPUReturnOpLowering,
+              // Explicitly drop memory space when lowering private memory
+              // attributions since NVVM models it as `alloca`s in the default
+              // memory space and does not support `alloca`s with addrspace(5).
+              GPUFuncOpLowering<0>>(converter);
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                 "__nv_fabs");
   patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -1,9 +1,15 @@
+set(LLVM_TARGET_DEFINITIONS GPUToROCDL.td)
+mlir_tablegen(GPUToROCDL.cpp.inc -gen-rewriters)
+add_public_tablegen_target(MLIRGPUToROCDLIncGen)
+
 add_mlir_conversion_library(MLIRGPUtoROCDLTransforms
   LowerGpuOpsToROCDLOps.cpp
 
   DEPENDS
   MLIRConversionPassIncGen
+  MLIRGPUToROCDLIncGen
   )
+
 target_link_libraries(MLIRGPUtoROCDLTransforms
   PUBLIC
   LLVMSupport
diff --git a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td
@@ -0,0 +1,21 @@
+//==-- GPUToROCDL.td - GPU Ops to ROCDL Patterns -------------*- tablegen -*==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Patterns to lower GPU ops to ROCDL.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_CONVERSION_GPUTOROCDL_TD
+#define MLIR_CONVERSION_GPUTOROCDL_TD
+
+include "mlir/Dialect/GPU/GPUOps.td"
+include "mlir/Dialect/LLVMIR/ROCDLOps.td"
+
+def : Pat<(GPU_BarrierOp), (ROCDL_BarrierOp)>; // gpu.barrier -> rocdl.barrier
+
+#endif // MLIR_CONVERSION_GPUTOROCDL_TD
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -14,11 +14,16 @@
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/Vector/VectorOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
+#include "llvm/Support/FormatVariadic.h"
 
+#include "../GPUCommon/GPUOpsLowering.h"
 #include "../GPUCommon/IndexIntrinsicsOpLowering.h"
 #include "../GPUCommon/OpToFuncCallLowering.h"
 #include "../PassDetail.h"
@@ -27,6 +32,9 @@
 
 namespace {
 
+/// Import the GPU Ops to ROCDL Patterns.
+#include "GPUToROCDL.cpp.inc"
+
 // A pass that replaces all occurrences of GPU device operations with their
 // corresponding ROCDL equivalent.
 //
@@ -38,41 +46,25 @@
   void runOnOperation() override {
     gpu::GPUModuleOp m = getOperation();
 
-    OwningRewritePatternList patterns;
     LLVMTypeConverter converter(m.getContext());
-    populateStdToLLVMConversionPatterns(converter, patterns);
-    patterns.insert<
-        GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
-                                    ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
-                                    ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
-                                    ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
-        GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
-                                    ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
-        converter);
-    patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
-                                                  "__ocml_fabs_f64");
-    patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
-                                                   "__ocml_ceil_f64");
-    patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
-                                                 "__ocml_cos_f64");
-    patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
-                                                 "__ocml_exp_f64");
-    patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
-                                                 "__ocml_log_f64");
-    patterns.insert<OpToFuncCallLowering<Log10Op>>(
-        converter, "__ocml_log10_f32", "__ocml_log10_f64");
-    patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
-                                                  "__ocml_log2_f64");
-    patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
-                                                  "__ocml_tanh_f64");
 
-    ConversionTarget target(getContext());
-    target.addLegalDialect<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
+    OwningRewritePatternList patterns;
+    // First run the GPU dialect rewrite patterns greedily, then discard them.
+    populateGpuRewritePatterns(m.getContext(), patterns);
+    applyPatternsAndFoldGreedily(m, patterns);
+    patterns.clear();
+    // Then lower vector, standard and GPU ops in one partial conversion.
+    populateVectorToLLVMConversionPatterns(converter, patterns);
+    populateStdToLLVMConversionPatterns(converter, patterns);
+    populateGpuToROCDLConversionPatterns(converter, patterns);
+    LLVMConversionTarget target(getContext());
+    target.addIllegalDialect<gpu::GPUDialect>();
     target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
                         LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op>();
     target.addIllegalOp<FuncOp>();
+    target.addLegalDialect<ROCDL::ROCDLDialect>();
+    // TODO(whchung@gmail.com): Remove once we support replacing non-root ops.
+    target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
     if (failed(applyPartialConversion(m, target, patterns, &converter)))
       signalPassFailure();
   }
@@ -80,6 +72,37 @@
 
 } // anonymous namespace
 
+void mlir::populateGpuToROCDLConversionPatterns(
+    LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
+  populateWithGenerated(converter.getDialect()->getContext(), &patterns);
+  patterns.insert<
+      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
+                                  ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
+      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
+                                  ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
+      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
+                                  ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
+      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
+                                  ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
+      GPUFuncOpLowering</*AllocaAddrSpace=*/5>, GPUReturnOpLowering>(converter);
+  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
+                                                "__ocml_fabs_f64");
+  patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
+                                                 "__ocml_ceil_f64");
+  patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
+                                               "__ocml_cos_f64");
+  patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
+                                               "__ocml_exp_f64");
+  patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
+                                               "__ocml_log_f64");
+  patterns.insert<OpToFuncCallLowering<Log10Op>>(converter, "__ocml_log10_f32",
+                                                 "__ocml_log10_f64");
+  patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
+                                                "__ocml_log2_f64");
+  patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
+                                                "__ocml_tanh_f64");
+}
+
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 mlir::createLowerGpuOpsToROCDLOpsPass() {
   return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,9 +1,10 @@
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_ops()
   func @gpu_index_ops()
-      attributes { gpu.kernel } {
+      -> (index, index, index, index, index, index,
+          index, index, index, index, index, index) {
     // CHECK: rocdl.workitem.id.x : !llvm.i32
     %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
     // CHECK: rocdl.workitem.id.y : !llvm.i32
@@ -32,68 +33,82 @@
     // CHECK: rocdl.grid.dim.z : !llvm.i32
     %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
 
+    std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
+               %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ
+        : index, index, index, index, index, index,
+          index, index, index, index, index, index
+  }
+}
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: func @gpu_sync()
+  func @gpu_sync() {
+    // CHECK: rocdl.barrier
+    gpu.barrier
     std.return
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_fabs
-  func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.absf %arg_f32 : f32
     // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.absf %arg_f64 : f64
     // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_ceil
-  func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.ceilf %arg_f32 : f32
     // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.ceilf %arg_f64 : f64
     // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_cos
-  func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.cos %arg_f32 : f32
     // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.cos %arg_f64 : f64
     // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_exp
-  func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %exp_f32 = std.exp %arg_f32 : f32
     // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
-    %result_f32 = std.exp %exp_f32 : f32
+    %result32 = std.exp %exp_f32 : f32
     // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.exp %arg_f64 : f64
     // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
@@ -101,20 +116,20 @@
 // -----
 
 // Test that we handled properly operation with SymbolTable other than module op
-gpu.module @kernel_module {
+gpu.module @test_module {
   "test.symbol_scope"() ({
     // CHECK: test.symbol_scope
     // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
     // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
     // CHECK-LABEL: func @gpu_exp
-    func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
+    func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
       %exp_f32 = std.exp %arg_f32 : f32
       // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
-      %result_f32 = std.exp %exp_f32 : f32
+      %result32 = std.exp %exp_f32 : f32
       // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
       %result64 = std.exp %arg_f64 : f64
       // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-      std.return
+      std.return %result32, %result64 : f32, f64
     }
     "test.finish" () : () -> ()
   }) : () -> ()
@@ -122,60 +137,60 @@
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log
-  func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log %arg_f32 : f32
     // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_log10_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log10
-  func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log10 %arg_f32 : f32
     // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log10 %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_log2
-  func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.log2 %arg_f32 : f32
     // CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.log2 %arg_f64 : f64
     // CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
 
 // -----
 
-gpu.module @kernel_module {
+gpu.module @test_module {
   // CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float
   // CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double
   // CHECK-LABEL: func @gpu_tanh
-  func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) {
+  func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
     %result32 = std.tanh %arg_f32 : f32
     // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
     %result64 = std.tanh %arg_f64 : f64
     // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
-    std.return
+    std.return %result32, %result64 : f32, f64
   }
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir b/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir
@@ -0,0 +1,145 @@
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck %s
+
+gpu.module @kernel {
+  // CHECK-LABEL:  llvm.func @private
+  gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
+    // Private attributions are allocated inside the function with an alloca.
+    // CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
+    // CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
+
+    // Populate the memref descriptor: both pointers, offset 0, size 4, stride 1.
+    // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }">
+    // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+    // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+    // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+    // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+    // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+    // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+    // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+    // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+    // "Store" lowering should work just as for any other memref; only check
+    // that it reads the descriptor built above and emits the core instructions.
+    // CHECK: llvm.extractvalue %[[descr6]]
+    // CHECK: llvm.getelementptr
+    // CHECK: llvm.store
+    %c0 = constant 0 : index
+    store %arg0, %arg1[%c0] : memref<4xf32, 5>
+
+    "terminator"() : () -> ()
+  }
+}
+
+// -----
+
+gpu.module @kernel {
+  // Workgroup buffers are allocated as internal globals in address space 3.
+  // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
+  // CHECK-SAME:  addr_space = 3
+  // CHECK-SAME:  !llvm<"[4 x float]">
+
+  // CHECK-LABEL: llvm.func @workgroup
+  // CHECK-SAME: {
+  gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
+    // Get the address of the first element in the global array.
+    // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+    // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
+    // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+    // CHECK-SAME: !llvm<"float addrspace(3)*">
+
+    // Populate the memref descriptor: both pointers, offset 0, size 4, stride 1.
+    // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
+    // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+    // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+    // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+    // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+    // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+    // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+    // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+    // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
+
+    // "Store" lowering should work just as for any other memref; only check
+    // that it reads the descriptor built above and emits the core instructions.
+    // CHECK: llvm.extractvalue %[[descr6]]
+    // CHECK: llvm.getelementptr
+    // CHECK: llvm.store
+    %c0 = constant 0 : index
+    store %arg0, %arg1[%c0] : memref<4xf32, 3>
+
+    "terminator"() : () -> ()
+  }
+}
+
+// -----
+
+gpu.module @kernel {
+  // Check that the total size was computed correctly: 4 * 2 * 6 = 48 elements.
+  // CHECK: llvm.mlir.global internal @[[buffer:.*]]()
+  // CHECK-SAME:  addr_space = 3
+  // CHECK-SAME:  !llvm<"[48 x float]">
+
+  // CHECK-LABEL: llvm.func @workgroup3d
+  gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
+    // Get the address of the first element in the global array.
+    // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+    // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
+    // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
+    // CHECK-SAME: !llvm<"float addrspace(3)*">
+
+    // Populate the memref descriptor; sizes 4, 2, 6 go with strides 12, 6, 1.
+    // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
+    // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
+    // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
+    // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
+    // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
+    // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
+    // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
+    // CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
+    // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
+    // CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
+    // CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
+    // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+    // CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
+    // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
+    // CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
+    // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
+    // CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
+
+    %c0 = constant 0 : index
+    store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
+    "terminator"() : () -> ()
+  }
+}
+
+// -----
+
+gpu.module @kernel {
+  // Check that one global buffer is defined per workgroup attribution.
+  // CHECK: llvm.mlir.global internal @[[buffer1:.*]]()
+  // CHECK-SAME:  !llvm<"[1 x float]">
+  // CHECK: llvm.mlir.global internal @[[buffer2:.*]]()
+  // CHECK-SAME:  !llvm<"[2 x float]">
+
+  // CHECK-LABEL: llvm.func @multiple
+  gpu.func @multiple(%arg0: f32)
+      workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
+      private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
+
+    // Workgroup buffers: the address of each global is taken in the function.
+    // CHECK: llvm.mlir.addressof @[[buffer1]]
+    // CHECK: llvm.mlir.addressof @[[buffer2]]
+
+    // Private buffers: one alloca per private attribution, of 3 and 4 floats.
+    // CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
+    // CHECK: llvm.alloca %[[c3]] x !llvm.float
+    // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
+    // CHECK: llvm.alloca %[[c4]] x !llvm.float
+
+    %c0 = constant 0 : index
+    store %arg0, %arg1[%c0] : memref<1xf32, 3>
+    store %arg0, %arg2[%c0] : memref<2xf32, 3>
+    store %arg0, %arg3[%c0] : memref<3xf32, 5>
+    store %arg0, %arg4[%c0] : memref<4xf32, 5>
+    "terminator"() : () -> ()
+  }
+}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -28,3 +28,9 @@
   %11 = rocdl.grid.dim.z : !llvm.i32
   llvm.return %0 : !llvm.i32
 }
+
+func @rocdl.barrier() {
+  // CHECK: rocdl.barrier
+  rocdl.barrier
+  llvm.return
+}
diff --git a/mlir/test/Target/rocdl.mlir b/mlir/test/Target/rocdl.mlir
--- a/mlir/test/Target/rocdl.mlir
+++ b/mlir/test/Target/rocdl.mlir
@@ -29,6 +29,14 @@
   llvm.return %1 : !llvm.i32
 }
 
+llvm.func @rocdl.barrier() {
+  // CHECK:      fence syncscope("workgroup") release
+  // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+  // CHECK-NEXT: fence syncscope("workgroup") acquire
+  rocdl.barrier
+  llvm.return
+}
+
 llvm.func @kernel_func() attributes {gpu.kernel} {
   // CHECK-LABEL: amdgpu_kernel void @kernel_func
   llvm.return