diff --git a/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
@@ -0,0 +1,26 @@
+//===- AMDGPUToROCDL.h - Convert AMDGPU to ROCDL dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
+#define MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
+
+#include <memory>
+
+namespace mlir {
+
+class LLVMTypeConverter;
+class RewritePatternSet;
+class Pass;
+
+void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
+                                             RewritePatternSet &patterns);
+
+std::unique_ptr<Pass> createConvertAMDGPUToROCDLPass();
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_CONVERSION_PASSES_H
 #define MLIR_CONVERSION_PASSES_H
 
+#include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
 #include "mlir/Conversion/ArithmeticToSPIRV/ArithmeticToSPIRV.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -73,6 +73,22 @@
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// AMDGPUToROCDL
+//===----------------------------------------------------------------------===//
+
+def ConvertAMDGPUToROCDL : Pass<"convert-amdgpu-to-rocdl"> {
+  let summary = "Convert AMDGPU dialect to ROCDL dialect";
+  let description = [{
+    This pass converts supported AMDGPU ops to ROCDL dialect intrinsics.
+  }];
+  let constructor = "mlir::createConvertAMDGPUToROCDLPass()";
+  let dependentDialects = [
+    "LLVM::LLVMDialect",
+    "ROCDL::ROCDLDialect",
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // ArithmeticToLLVM
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -0,0 +1,166 @@
+//===-- AMDGPU.td - AMDGPU dialect definitions -*- tablegen -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU
+#define AMDGPU
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/IR/OpBase.td"
+
+def AMDGPU_Dialect : Dialect {
+  let name = "amdgpu";
+  let cppNamespace = "::mlir::amdgpu";
+  let description = [{
+    The `AMDGPU` dialect provides wrappers around AMD-specific functionality
+    and LLVM intrinsics.
+    These wrappers should be used in conjunction with more generic dialects,
+    such as `gpu` and `vector`, when generating LLVM IR that will eventually be
+    executed on AMD hardware.
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Op definitions
+//===----------------------------------------------------------------------===//
+
+class AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
+  Op<AMDGPU_Dialect, mnemonic, traits> {}
+
+/// Raw buffer load
+def AMDGPU_RawBufferLoadOp :
+    AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins Arg<AnyMemRef, "buffer to read from", [MemRead]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)>,
+    Results<(outs AnyTypeOf<[BF16, F16, F32, I32, I8,
+                             VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                             VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                             VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value)> {

+  let summary = "Raw Buffer load, exposing GCN features";
+  let description = [{
+    The `amdgpu.raw_buffer_load` op is a wrapper around the buffer load
+    intrinsics available on AMD GPUs, including extensions in newer GPUs.
+
+    The index into the buffer is computed as for `memref.load` with the
+    addition of `indexOffset` and `sgprOffset` (which is added after bounds
+    checks and includes any offset present on the memref type if it's
+    non-zero).
+
+    All indices and offsets are in units of the memref's data type and are
+    converted to bytes during lowering.
+
+    When a load is out of bounds, the instruction returns zero.
+    For vector loads, each component's address is bounds-checked individually.
+
+    The memref struct is converted into a buffer resource (a V#) and the
+    arguments are translated to intrinsic arguments as follows:
+    - The base address of the buffer is the base address of the memref
+    - The stride is 0 to enable raw mode
+    - The number of records is the size of the memref, in bytes.
+      In the case of dynamically-shaped memrefs, this is computed at runtime
+      as max_d (size(d) * stride(d)) * sizeof(elementType(memref))
+    - The offset enable bit is 1, the index enable bit is 0
+    - The thread ID addition bit is off
+    - If the target is RDNA, OOB_SELECT is set to 2 to disable bounds checks
+      when `boundsCheck` is false and to 1 otherwise; on other targets these
+      bits are left at 0
+    - The cache coherency bits are off
+    - `targetIsRDNA` controls the setting of some reserved values that differ
+      between RDNA and CDNA cores
+  }];
+  let assemblyFormat = [{
+    attr-dict $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($memref) `,` type($indices) `->` type($value)
+  }];
+  let hasVerifier = 1;
+}
+
+/// Raw buffer store
+def AMDGPU_RawBufferStoreOp :
+    AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8,
+                              VectorOfLengthAndType<[2, 4], [F32, I32]>,
+                              VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
+                              VectorOfLengthAndType<[2, 4, 8, 16], [I8]>]>:$value,
+                   Arg<AnyMemRef, "buffer to write to", [MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {

+  let summary = "Raw Buffer Store, exposing GCN features";
+  let description = [{
+    The `amdgpu.raw_buffer_store` op is a wrapper around the buffer store
+    intrinsics available on AMD GPUs, including extensions in newer GPUs.
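+
+    For example, a scalar `i32` store, written with the placeholder SSA names
+    `%value`, `%buf`, and `%idx` (mirroring the lowering tests added in this
+    patch), looks like:
+
+    ```mlir
+    amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false}
+        %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
+    ```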
+
+    The store index is computed as in `memref.store` with the addition of
+    `indexOffset` (which is included for uniformity with atomics and may be
+    useful when writing vectorized code) and `sgprOffset` (which is added
+    after bounds checks and implicitly includes the offset of the memref type
+    if non-zero). All index components are in terms of the elements of the
+    memref, not bytes, and are scaled to bytes during lowering.
+
+    Out of bounds stores are ignored in hardware, including the out of bounds
+    components of vector writes.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasVerifier = 1;
+}
+
+// Raw buffer atomic floating point add
+def AMDGPU_RawBufferAtomicFaddOp :
+    AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
+      AttrSizedOperandSegments]>,
+    Arguments<(ins F32:$value,
+                   Arg<AnyMemRef, "buffer to atomically add to",
+                       [MemRead, MemWrite]>:$memref,
+                   BoolAttr:$targetIsRDNA,
+                   Variadic<I32>:$indices,
+                   DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+                   OptionalAttr<I32Attr>:$indexOffset,
+                   Optional<I32>:$sgprOffset)> {

+  let summary = "Raw Buffer Floating-point Atomic Add (MI-* only)";
+  let description = [{
+    The `amdgpu.raw_buffer_atomic_fadd` op is a wrapper around the
+    buffer-based atomic floating-point addition available on the MI-* series
+    of AMD GPUs.
+
+    The index into the buffer is computed as for `memref.store` with the
+    addition of `indexOffset` (which is used to aid in emitting vectorized
+    code) and, if present, `sgprOffset` (which is added after bounds checks
+    and includes any non-zero offset on the memref type).
+
+    All indexing components are given in terms of the memref's element size,
+    not the byte lengths required by the intrinsic.
+
+    Out of bounds atomic operations are ignored in hardware.
+
+    See `amdgpu.raw_buffer_load` for a description of how the underlying
+    instruction is constructed.
+  }];
+  let assemblyFormat = [{
+    attr-dict $value `->` $memref `[` $indices `]`
+      (`sgprOffset` $sgprOffset^)? `:`
+      type($value) `->` type($memref) `,` type($indices)
+  }];
+  let hasVerifier = 1;
+}
+
+#endif // AMDGPU
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPUDialect.h
@@ -0,0 +1,27 @@
+//===- AMDGPUDialect.h - MLIR Dialect for AMDGPU ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a dialect for MLIR wrappers around AMDGPU-specific
+// intrinsics and for other AMD GPU-specific functionality.
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ +#define MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc" + +#define GET_OP_CLASSES +#include "mlir/Dialect/AMDGPU/AMDGPU.h.inc" + +#endif // MLIR_DIALECT_AMDGPU_AMDGPUDIALECT_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt @@ -0,0 +1,4 @@ +add_mlir_dialect(AMDGPU amdgpu) +add_mlir_doc(AMDGPU AMDGPU Dialects/ -gen-dialect-doc) + +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt --- a/mlir/include/mlir/Dialect/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Affine) +add_subdirectory(AMDGPU) add_subdirectory(Arithmetic) add_subdirectory(Async) add_subdirectory(ArmNeon) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -179,4 +179,57 @@ let hasCustomAssemblyFormat = 1; } +//===---------------------------------------------------------------------===// +// Raw buffer load/store intrinsics + +def ROCDL_RawBufferLoadOp : + ROCDL_Op<"raw.buffer.load">, + Results<(outs LLVM_Type:$res)>, + Arguments<(ins LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)> { + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_load, {$rsrc, $offset, + $soffset, $aux}, {$_resultType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +def ROCDL_RawBufferStoreOp : + ROCDL_Op<"raw.buffer.store">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_store, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + +//===---------------------------------------------------------------------===// +// MI-100 and MI-200 buffer atomic floating point add intrinsic + +def ROCDL_RawBufferAtomicFAddOp : + ROCDL_Op<"raw.buffer.atomic.fadd">, + Arguments<(ins LLVM_Type:$vdata, + LLVM_Type:$rsrc, + LLVM_Type:$offset, + LLVM_Type:$soffset, + LLVM_Type:$aux)>{ + string llvmBuilder = [{ + auto vdataType = moduleTranslation.convertType(op.vdata().getType()); + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_raw_buffer_atomic_fadd, {$vdata, $rsrc, + $offset, $soffset, $aux}, {vdataType}); + }]; + let hasCustomAssemblyFormat = 1; +} + #endif // ROCDLIR_OPS diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -14,6 +14,7 @@ #ifndef MLIR_INITALLDIALECTS_H_ #define MLIR_INITALLDIALECTS_H_ +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" #include "mlir/Dialect/AMX/AMXDialect.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" @@ -69,6 +70,7 @@ registry.insert +struct 
RawBufferOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + static constexpr uint32_t maxVectorOpWidth = 128; + + Value createI32Constant(ConversionPatternRewriter &rewriter, Location loc, + int32_t value) const { + IntegerAttr valAttr = rewriter.getI32IntegerAttr(value); + Type llvmI32 = this->typeConverter->convertType(rewriter.getI32Type()); + return rewriter.create(loc, llvmI32, valAttr); + } + + LogicalResult + matchAndRewrite(GpuOp gpuOp, typename GpuOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = gpuOp.getLoc(); + Value memref = adaptor.memref(); + Value unconvertedMemref = gpuOp.memref(); + MemRefType memrefType = unconvertedMemref.getType().cast(); + + Value storeData = adaptor.getODSOperands(0)[0]; + if (storeData == memref) // no write component to this op + storeData = Value(); + Type wantedDataType; + if (storeData) + wantedDataType = storeData.getType(); + else + wantedDataType = gpuOp.getODSResults(0)[0].getType(); + + Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType); + + Type i32 = rewriter.getI32Type(); + Type llvmI32 = this->typeConverter->convertType(i32); + + int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8; + Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth); + + // If we want to load a vector with total size <= 32 + // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32 + // and the + Type llvmBufferValType = llvmWantedDataType; + if (auto dataVector = wantedDataType.dyn_cast()) { + uint32_t elemBits = dataVector.getElementTypeBitWidth(); + uint32_t totalBits = elemBits * dataVector.getNumElements(); + if (totalBits > maxVectorOpWidth) + return gpuOp.emitOpError( + "Total width of loads or stores must be no more than " + + Twine(maxVectorOpWidth) + " bits, but we call for " + + Twine(totalBits) + + " bits. This should've been caught in validation"); + if (elemBits < 32) { + if (totalBits > 32) { + if (totalBits % 32 != 0) + return gpuOp.emitOpError("Load or store of more than 32-bits that " + "doesn't fit into words. 
Can't happen\n"); + llvmBufferValType = this->typeConverter->convertType( + VectorType::get(totalBits / 32, i32)); + } else { + llvmBufferValType = this->typeConverter->convertType( + rewriter.getIntegerType(totalBits)); + } + } + } + + SmallVector args; + if (storeData) { + if (llvmBufferValType != llvmWantedDataType) { + Value castForStore = + rewriter.create(loc, llvmBufferValType, storeData); + args.push_back(castForStore); + } else { + args.push_back(storeData); + } + } + + // Construct buffer descriptor from memref, attributes + int64_t offset = 0; + SmallVector strides; + if (failed(getStridesAndOffset(memrefType, strides, offset))) + return gpuOp.emitOpError("Can't lower non-stride-offset memrefs"); + + // Resource descriptor + // bits 0-47: base address + // bits 48-61: stride (0 for raw buffers) + // bit 62: texture cache coherency (always 0) + // bit 63: enable swizzles (always off for raw buffers) + // bits 64-95 (word 2): Number of records, units of stride + // bits 96-127 (word 3): See below + + Type llvm4xI32 = this->typeConverter->convertType(VectorType::get(4, i32)); + MemRefDescriptor memrefDescriptor(memref); + Type llvmI64 = this->typeConverter->convertType(rewriter.getI64Type()); + Type llvm2xI32 = this->typeConverter->convertType(VectorType::get(2, i32)); + + Value resource = rewriter.create(loc, llvm4xI32); + + Value ptr = memrefDescriptor.alignedPtr(rewriter, loc); + Value ptrAsInt = rewriter.create(loc, llvmI64, ptr); + Value ptrAsInts = + rewriter.create(loc, llvm2xI32, ptrAsInt); + for (int64_t i = 0; i < 2; ++i) { + Value idxConst = this->createIndexConstant(rewriter, loc, i); + Value part = + rewriter.create(loc, ptrAsInts, idxConst); + resource = rewriter.create( + loc, llvm4xI32, resource, part, idxConst); + } + + Value numRecords; + if (memrefType.hasStaticShape()) { + numRecords = createI32Constant( + rewriter, loc, + static_cast(memrefType.getNumElements() * elementByteWidth)); + } else { + Value maxIndex; + for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) { + Value size = memrefDescriptor.size(rewriter, loc, i); + Value stride = memrefDescriptor.stride(rewriter, loc, i); + stride = rewriter.create(loc, stride, byteWidthConst); + Value maxThisDim = rewriter.create(loc, size, stride); + maxIndex = maxIndex ? rewriter.create(loc, maxIndex, + maxThisDim) + : maxThisDim; + } + numRecords = rewriter.create(loc, llvmI32, maxIndex); + } + resource = rewriter.create( + loc, llvm4xI32, resource, numRecords, + this->createIndexConstant(rewriter, loc, 2)); + + // Final word: + // bits 0-11: dst sel, ignored by these intrinsics + // bits 12-14: data format (ignored, must be nonzero, 7=float) + // bits 15-18: data format (ignored, must be nonzero, 4=32bit) + // bit 19: In nested heap (0 here) + // bit 20: Behavior on unmap (0 means "return 0 / ignore") + // bits 21-22: Index stride for swizzles (N/A) + // bit 23: Add thread ID (0) + // bit 24: Reserved to 1 (RDNA) or 0 (CDNA) + // bits 25-26: Reserved (0) + // bit 27: Buffer is non-volatile (CDNA only) + // bits 28-29: Out of bounds select (0 = structured, 1 = raw, 2 = none, 3 = + // swizzles) RDNA only + // bits 30-31: Type (must be 0) + uint32_t word3 = (7 << 12) | (4 << 15); + if (adaptor.targetIsRDNA()) { + word3 |= (1 << 24); + uint32_t oob = adaptor.boundsCheck() ? 
1 : 2; + word3 |= (oob << 28); + } + Value word3Const = createI32Constant(rewriter, loc, word3); + resource = rewriter.create( + loc, llvm4xI32, resource, word3Const, + this->createIndexConstant(rewriter, loc, 3)); + args.push_back(resource); + + // Indexing (voffset) + Value voffset; + for (auto &pair : llvm::enumerate(adaptor.indices())) { + size_t i = pair.index(); + Value index = pair.value(); + Value strideOp; + if (ShapedType::isDynamicStrideOrOffset(strides[i])) { + strideOp = rewriter.create( + loc, memrefDescriptor.stride(rewriter, loc, i), byteWidthConst); + } else { + strideOp = + createI32Constant(rewriter, loc, strides[i] * elementByteWidth); + } + index = rewriter.create(loc, index, strideOp); + voffset = + voffset ? rewriter.create(loc, voffset, index) : index; + } + if (adaptor.indexOffset().hasValue()) { + int32_t indexOffset = *gpuOp.indexOffset() * elementByteWidth; + Value extraOffsetConst = createI32Constant(rewriter, loc, indexOffset); + voffset = + voffset ? rewriter.create(loc, voffset, extraOffsetConst) + : extraOffsetConst; + } + args.push_back(voffset); + + Value sgprOffset = adaptor.sgprOffset(); + if (!sgprOffset) + sgprOffset = createI32Constant(rewriter, loc, 0); + if (ShapedType::isDynamicStrideOrOffset(offset)) + sgprOffset = rewriter.create( + loc, memrefDescriptor.offset(rewriter, loc), sgprOffset); + else if (offset > 0) + sgprOffset = rewriter.create( + loc, sgprOffset, createI32Constant(rewriter, loc, offset)); + args.push_back(sgprOffset); + + // bit 0: GLC = 0 (atomics drop value, less coherency) + // bits 1-2: SLC, DLC = 0 (similarly) + // bit 3: swizzled (0 for raw) + args.push_back(createI32Constant(rewriter, loc, 0)); + + llvm::SmallVector resultTypes(gpuOp->getNumResults(), + llvmBufferValType); + Operation *lowered = rewriter.create(loc, resultTypes, args, + ArrayRef()); + if (lowered->getNumResults() == 1) { + Value replacement = lowered->getResults()[0]; + if (llvmBufferValType != llvmWantedDataType) { + replacement = rewriter.create(loc, llvmWantedDataType, + replacement); + } + rewriter.replaceOp(gpuOp, replacement); + } else { + rewriter.eraseOp(gpuOp); + } + return success(); + } +}; + +struct ConvertAMDGPUToROCDLPass + : public ConvertAMDGPUToROCDLBase { + ConvertAMDGPUToROCDLPass() = default; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + LLVMTypeConverter converter(&getContext()); + populateAMDGPUToROCDLConversionPatterns(converter, patterns); + LLVMConversionTarget target(getContext()); + target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); + target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>(); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace + +void mlir::populateAMDGPUToROCDLConversionPatterns( + LLVMTypeConverter &converter, RewritePatternSet &patterns) { + patterns.add< + RawBufferOpLowering, + RawBufferOpLowering, + RawBufferOpLowering>(converter); +} + +std::unique_ptr mlir::createConvertAMDGPUToROCDLPass() { + return std::make_unique(); +} diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/AMDGPUToROCDL/CMakeLists.txt @@ -0,0 +1,20 @@ +add_mlir_conversion_library(MLIRAMDGPUToROCDL + AMDGPUToROCDL.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/AMDGPUToROCDL + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + 
MLIRLLVMCommonConversion + MLIRLLVMIR + MLIRROCDLIR + MLIRAMDGPU + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(AffineToStandard) +add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithmeticToLLVM) add_subdirectory(ArithmeticToSPIRV) add_subdirectory(ArmNeon2dToIntr) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -18,6 +18,7 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/LoweringOptions.h" +#include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -0,0 +1,60 @@ +//===- AMDGPUDialect.cpp - MLIR AMDGPU dialect implementation --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AMDGPU dialect and its operations. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/TypeUtilities.h" + +using namespace mlir; + +#include "mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc" + +void amdgpu::AMDGPUDialect::initialize() { + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" + >(); +} + +//===----------------------------------------------------------------------===// +// RawBuffer*Op +//===----------------------------------------------------------------------===// +template +static LogicalResult verifyRawBufferOp(T &op) { + MemRefType bufferType = op.memref().getType().template cast(); + if (bufferType.getMemorySpaceAsInt() != 0) + return op.emitOpError( + "Buffer ops must operate on a memref in global memory"); + if (!bufferType.hasRank()) + return op.emitOpError( + "Cannot meaningfully buffer_store to an unranked memref"); + if (static_cast(op.indices().size()) != bufferType.getRank()) + return op.emitOpError("Expected " + Twine(bufferType.getRank()) + + " indices to memref"); + return success(); +} + +LogicalResult amdgpu::RawBufferLoadOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult amdgpu::RawBufferStoreOp::verify() { + return verifyRawBufferOp(*this); +} + +LogicalResult amdgpu::RawBufferAtomicFaddOp::verify() { + return verifyRawBufferOp(*this); +} + +#define GET_OP_CLASSES +#include "mlir/Dialect/AMDGPU/AMDGPU.cpp.inc" diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_dialect_library(MLIRAMDGPU + AMDGPUDialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU + + DEPENDS + MLIRAMDGPUIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRSideEffectInterfaces + ) diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt --- a/mlir/lib/Dialect/CMakeLists.txt +++ b/mlir/lib/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(Affine) +add_subdirectory(AMDGPU) add_subdirectory(Arithmetic) add_subdirectory(ArmNeon) add_subdirectory(ArmSVE) diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp @@ -84,6 +84,76 @@ p << " " << getOperands() << " : " << vdata().getType(); } +// ::= +// `llvm.amdgcn.raw.buffer.load.* %rsrc, %offset, %soffset, %aux +// : result_type` +ParseResult RawBufferLoadOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 4) || parser.parseColonType(type) || + parser.addTypeToList(type, result.types)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + return parser.resolveOperands(ops, {i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands); +} + +void RawBufferLoadOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << res().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.store.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferStoreOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return 
failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferStoreOp::print(OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + +// ::= +// `llvm.amdgcn.raw.buffer.atomic.fadd.* %vdata, %rsrc, %offset, +// %soffset, %aux : result_type` +ParseResult RawBufferAtomicFAddOp::parse(OpAsmParser &parser, + OperationState &result) { + SmallVector ops; + Type type; + if (parser.parseOperandList(ops, 5) || parser.parseColonType(type)) + return failure(); + + auto bldr = parser.getBuilder(); + auto int32Ty = bldr.getI32Type(); + auto i32x4Ty = VectorType::get({4}, int32Ty); + + if (parser.resolveOperands(ops, {type, i32x4Ty, int32Ty, int32Ty, int32Ty}, + parser.getNameLoc(), result.operands)) + return failure(); + return success(); +} + +void RawBufferAtomicFAddOp::print(mlir::OpAsmPrinter &p) { + p << " " << getOperands() << " : " << vdata().getType(); +} + //===----------------------------------------------------------------------===// // ROCDLDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -0,0 +1,110 @@ +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl | FileCheck %s + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32 +func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna +func.func @gpu_gcn_raw_buffer_load_i32_rdna(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(285372416 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off +func.func @gpu_gcn_raw_buffer_load_i32_rdna_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 { + // CHECK: %[[word3:.*]] = llvm.mlir.constant(553807872 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = false, targetIsRDNA = true} %buf[%idx] : memref<64xi32>, i32 -> i32 + func.return %0 : i32 +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi32 +func.func @gpu_gcn_raw_buffer_load_2xi32(%buf: memref<64xi32>, %idx: i32) -> vector<2xi32> { + // 
CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<2xi32> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi32>, i32 -> vector<2xi32> + func.return %0 : vector<2xi32> +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i8 +func.func @gpu_gcn_raw_buffer_load_i8(%buf: memref<64xi8>, %idx: i32) -> i8 { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[ret:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i8 + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> i8 + func.return %0 : i8 +} +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_2xi8 +func.func @gpu_gcn_raw_buffer_load_2xi8(%buf: memref<64xi8>, %idx: i32) -> vector<2xi8> { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(64 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : i16 to vector<2xi8> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<2xi8> + func.return %0 : vector<2xi8> +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_16xi8 +func.func @gpu_gcn_raw_buffer_load_16xi8(%buf: memref<64xi8>, %idx: i32) -> vector<16xi8> { + // CHECK: %[[loaded:.*]] = rocdl.raw.buffer.load %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + // CHECK: %[[ret:.*]] = llvm.bitcast %[[loaded]] : vector<4xi32> to vector<16xi8> + // CHECK: return %[[ret]] + %0 = amdgpu.raw_buffer_load {boundsCheck = true, targetIsRDNA = false} %buf[%idx] : memref<64xi8>, i32 -> vector<16xi8> + func.return %0 : vector<16xi8> +} + +// Since the lowering logic is shared with loads, only bitcasts need to be rechecked +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32 +func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.store %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32 + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_2xi8 +func.func @gpu_gcn_raw_buffer_store_2xi8(%value: vector<2xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<2xi8> to i16 + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : i16 + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<2xi8> -> memref<64xi8>, i32 + func.return +} + +// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_16xi8 +func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64xi8>, %idx: i32) { + // CHECK: %[[cast:.*]] = llvm.bitcast %{{.*}} : vector<16xi8> to vector<4xi32> + // CHECK: rocdl.raw.buffer.store %[[cast]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : vector<4xi32> + amdgpu.raw_buffer_store {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : vector<16xi8> -> memref<64xi8>, i32 + func.return +} + +// And more so 
for atomic add +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32 +func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) + // CHECK: llvm.insertelement{{.*}}%[[numRecords]] + // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32) + // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]] + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, targetIsRDNA = false} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32 + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: mlir-opt -allow-unregistered-dialect %s | mlir-opt -allow-unregistered-dialect | FileCheck %s +// Verify the generic form can be parsed. +// RUN: mlir-opt -allow-unregistered-dialect -mlir-print-op-generic %s | mlir-opt -allow-unregistered-dialect | FileCheck %s + +// CHECK-LABEL: func @raw_buffer_load_f32_from_rank_1 +func.func @raw_buffer_load_f32_from_rank_1(%src : memref<128xf32>, %offset : i32, %idx0 : i32) -> f32 { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[{{.*}}] sgprOffset %{{.*}} : memref<128xf32>, i32 -> f32 + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0] sgprOffset %offset : memref<128xf32>, i32 -> f32 + func.return %0 : f32 +} + +// CHECK-LABEL: func @raw_buffer_load_f32_from_rank_4 +func.func @raw_buffer_load_f32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> f32 { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> f32 + func.return %0 : f32 +} + +// CHECK-LABEL: func @raw_buffer_load_4xf32_from_rank_4 +func.func @raw_buffer_load_4xf32_from_rank_4(%src : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) -> vector<4xf32> { + // CHECK: amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %src[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : memref<128x64x32x16xf32>, i32, i32, i32, i32 -> vector<4xf32> + func.return %0 : vector<4xf32> +} + +// CHECK-LABEL: func @raw_buffer_store_f32_to_rank_1 +func.func @raw_buffer_store_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> 
memref<128xf32>, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_store_f32_to_rank_4 +func.func @raw_buffer_store_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_store_4xf32_to_rank_4 +func.func @raw_buffer_store_4xf32_to_rank_4(%value : vector<4xf32>, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_store {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : vector<4xf32> -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_atomic_fadd_f32_to_rank_1 +func.func @raw_buffer_atomic_fadd_f32_to_rank_1(%value : f32, %dst : memref<128xf32>, %offset : i32, %idx0 : i32) { + // CHECK: amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128xf32>, i32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0] sgprOffset %offset : f32 -> memref<128xf32>, i32 + func.return +} + +// CHECK-LABEL: func @raw_buffer_atomic_fadd_f32_to_rank_4 +func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) { + // CHECK: amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32, targetIsRDNA = false} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32 + func.return +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -167,6 +167,34 @@ llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %aux : i32, %vdata1 : f32, + %vdata2 : vector<2xf32>, %vdata4 : vector<4xf32>) { + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: %{{.*}} = rocdl.raw.buffer.load %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<4xf32> + + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : vector<2xf32> + // CHECK: rocdl.raw.buffer.store %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : 
vector<4xf32> + + // CHECK: rocdl.raw.buffer.atomic.fadd %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : f32 + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : f32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xf32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xf32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : f32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xf32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %offset, %aux : vector<4xf32> + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // ----- // expected-error@below {{attribute attached to unexpected op}} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -169,21 +169,61 @@ %slc = llvm.mlir.constant(true) : i1 // CHECK-LABEL: rocdl.mubuf // CHECK: call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - %r4 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v1f32(<1 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) - rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> // CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i1 {{.*}}, i1 {{.*}}) + + %r1 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + %r2 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> + %r4 = rocdl.buffer.load %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> + + rocdl.buffer.store %vdata1, %rsrc, %vindex, %offset, %glc, %slc : vector<1xf32> + rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32> rocdl.buffer.store %vdata4, %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32> llvm.return } +llvm.func @rocdl.raw.buffer(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : i32, + %vdata2 : vector<2xi32>, + %vdata4 : vector<4xi32>) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer + // CHECK: call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + // CHECK: call void @llvm.amdgcn.raw.buffer.store.i32(i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + // CHECK: call void 
@llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + %r1 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : i32 + %r2 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<2xi32> + %r4 = rocdl.raw.buffer.load %rsrc, %offset, %soffset, %aux : vector<4xi32> + + rocdl.raw.buffer.store %vdata1, %rsrc, %offset, %soffset, %aux : i32 + rocdl.raw.buffer.store %vdata2, %rsrc, %offset, %soffset, %aux : vector<2xi32> + rocdl.raw.buffer.store %vdata4, %rsrc, %offset, %soffset, %aux : vector<4xi32> + + llvm.return +} + +llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>, + %offset : i32, %soffset : i32, + %vdata1 : f32) { + %aux = llvm.mlir.constant(0 : i32) : i32 + // CHECK-LABEL: rocdl.raw.buffer.atomic + // CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}} + + rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32 + + llvm.return +} + // CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024" diff --git a/mlir/test/mlir-opt/commandline.mlir b/mlir/test/mlir-opt/commandline.mlir --- a/mlir/test/mlir-opt/commandline.mlir +++ b/mlir/test/mlir-opt/commandline.mlir @@ -2,6 +2,7 @@ // CHECK: Available Dialects: // CHECK-NEXT: acc // CHECK-NEXT: affine +// CHECK-NEXT: amdgpu // CHECK-NEXT: amx // CHECK-NEXT: arith // CHECK-NEXT: arm_neon diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1148,6 +1148,69 @@ deps = [":AffineOpsTdFiles"], ) +##---------------------------------------------------------------------------## +# AMDGPU dialect. +##---------------------------------------------------------------------------## + +td_library( + name = "AMDGPUTdFiles", + srcs = ["include/mlir/Dialect/AMDGPU/AMDGPU.td"], + includes = ["include"], + deps = [ + ":SideEffectInterfacesTdFiles", + ], +) + +gentbl_cc_library( + name = "AMDGPUIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + [ + "-gen-dialect-decls", + "-dialect=AMDGPU", + ], + "include/mlir/Dialect/AMDGPU/AMDGPUDialect.h.inc", + ), + ( + [ + "-gen-dialect-defs", + "-dialect=AMDGPU", + ], + "include/mlir/Dialect/AMDGPU/AMDGPUDialect.cpp.inc", + ), + ( + ["-gen-op-decls"], + "include/mlir/Dialect/AMDGPU/AMDGPU.h.inc", + ), + ( + ["-gen-op-defs"], + "include/mlir/Dialect/AMDGPU/AMDGPU.cpp.inc", + ), + ( + ["-gen-op-doc"], + "g3doc/Dialects/AMDGPU/AMDGPU.md", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/AMDGPU/AMDGPU.td", + deps = [":AMDGPUTdFiles"], +) + +cc_library( + name = "AMDGPU", + srcs = ["lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp"], + hdrs = ["include/mlir/Dialect/AMDGPU/AMDGPUDialect.h"], + includes = ["include"], + deps = [ + ":IR", + ":AMDGPUIncGen", + ":SideEffectInterfaces", + "//llvm:Core", + "//llvm:Support", + ], +) + ##---------------------------------------------------------------------------## # EmitC dialect. 
##---------------------------------------------------------------------------## @@ -2384,6 +2447,7 @@ includes = ["include"], deps = [ ":AffineToStandard", + ":AMDGPUToROCDL", ":ArithmeticToLLVM", ":ArithmeticToSPIRV", ":ArmNeon2dToIntr", @@ -3616,22 +3680,24 @@ ], ) + + cc_library( - name = "NVGPUToNVVM", + name = "AMDGPUToROCDL", srcs = glob([ - "lib/Conversion/NVGPUToNVVM/*.cpp", - "lib/Conversion/NVGPUToNVVM/*.h", + "lib/Conversion/AMDGPUToROCDL/*.cpp", + "lib/Conversion/AMDGPUToROCDL/*.h", ]) + [":ConversionPassDetail"], hdrs = glob([ - "include/mlir/Conversion/NVGPUToNVVM/*.h", + "include/mlir/Conversion/AMDGPUToROCDL/*.h", ]), includes = ["include"], deps = [ ":ConversionPassIncGen", ":IR", ":LLVMCommonConversion", - ":NVGPU", - ":NVVMDialect", + ":AMDGPU", + ":AMDGPUDialect", ":Pass", ":Transforms", "//llvm:Support", @@ -6049,6 +6115,8 @@ ":AffinePassIncGen", ":AffineToStandard", ":AffineTransforms", + ":AMDGPU", + ":AMDGPUToROCDL", ":ArithmeticDialect", ":ArithmeticToLLVM", ":ArithmeticToSPIRV",