diff --git a/mlir/include/mlir/Dialect/GPU/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/IR/CMakeLists.txt --- a/mlir/include/mlir/Dialect/GPU/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/IR/CMakeLists.txt @@ -16,6 +16,11 @@ mlir_tablegen(GPUOpsEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRGPUOpsEnumsGen) +set(LLVM_TARGET_DEFINITIONS CompilationAttrInterfaces.td) +mlir_tablegen(CompilationAttrInterfaces.h.inc -gen-attr-interface-decls) +mlir_tablegen(CompilationAttrInterfaces.cpp.inc -gen-attr-interface-defs) +add_public_tablegen_target(MLIRGPUCompilationAttrInterfacesIncGen) + set(LLVM_TARGET_DEFINITIONS GPUOps.td) mlir_tablegen(GPUOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=gpu) mlir_tablegen(GPUOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=gpu) diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td @@ -0,0 +1,116 @@ +//===-- CompilationAttrInterfaces.td - GPU compilation interfaces ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines interfaces for GPU target attributes & GPU object manager +// attributes. +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_COMPILATIONATTRINTERFACES +#define GPU_COMPILATIONATTRINTERFACES + +include "mlir/IR/AttrTypeBase.td" +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// GPU target attribute interface. 
+//===----------------------------------------------------------------------===// + +def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> { + let description = [{ + Interface for GPU target attributes. Attributes implementing this interface + compile GPU modules into binary objects, providing an opaque interface to + hide implementation details. + }]; + let cppNamespace = "::mlir::gpu"; + let methods = [ + InterfaceMethod<[{ + Serializes a GPU module to a string containing a representation of the + module. + + All attributes implementing this interface must implement this method. + If serialization fails then the method should return `std::nullopt`. + + The `options` argument is meant to be used for passing additional + options that are not in the attribute. + }], + "std::optional>", "serializeToObject", + (ins "Operation*":$module, "const gpu::TargetOptions&":$options) + > + ]; +} + +def ImplementsTargetAttrInterface : AttrConstraint< + CPred<"$_self.hasTrait<::mlir::gpu::TargetAttrInterface::Trait>()">, + "Attribute implementing the `TargetAttrInterface` interface." +>; + +def GPUTargetAttr : ConfinedAttr { + let description = [{ + Generic target attribute implementing the `TargetAttrInterface` interface. + }]; +} + +def GPUTargetArrayAttr : + TypedArrayAttrBase; + +def GPUNonEmptyTargetArrayAttr : + ConfinedAttr]>; + +//===----------------------------------------------------------------------===// +// GPU object manager attribute interface. +//===----------------------------------------------------------------------===// + +def GPUObjectManagerAttrInterface : + AttrInterface<"ObjectManagerAttrInterface"> { + let description = [{ + Interface for GPU object manager attributes. Attributes implementing this + interface manage the interaction between GPU objects and host IR. + }]; + let cppNamespace = "::mlir::gpu"; + let methods = [ + InterfaceMethod<[{ + Embeds a GPU object into a host LLVM module. 
The operation expected by + this method must be a GPU BinaryOp. + + All attributes implementing this interface must implement this method. + If the method fails then it must return `failure`. + }], + "LogicalResult", "embedBinary", + (ins "Operation*":$binaryOp, "llvm::IRBuilderBase&":$hostBuilder, + "LLVM::ModuleTranslation&":$hostModuleTranslation) + >, + InterfaceMethod<[{ + Launches a kernel inside a binary. The first argument must be a GPU + LaunchFuncOp, while the second one a GPU BinaryOp. + + All attributes implementing this interface must implement this method. + If the method fails then it must return `failure`. + }], + "LogicalResult", "launchKernel", + (ins "Operation*":$launchFunc, "Operation*":$binaryOp, + "llvm::IRBuilderBase&":$hostBuilder, + "LLVM::ModuleTranslation&":$hostModuleTranslation) + > + ]; +} + +def ImplementsObjectManagerAttrInterface : AttrConstraint< + CPred<"$_self.hasTrait<::mlir::gpu::ObjectManagerAttrInterface::Trait>()">, + "Attribute implementing the `ObjectManagerAttrInterface` interface." +>; + +def GPUObjectManagerAttr : + ConfinedAttr { + let description = [{ + Generic compilation attribute implementing the `ObjectManagerAttrInterface` + interface. + }]; +} + +#endif // GPU_COMPILATIONATTRINTERFACES diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td @@ -0,0 +1,182 @@ +//===-- GPUCompilationAttr.td - GPU compilation attributes --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the GPU NVPTX & AMDGPU target attributes. 
+// +//===----------------------------------------------------------------------===// + +#ifndef GPU_COMPILATIONATTR +#define GPU_COMPILATIONATTR + +include "mlir/Dialect/GPU/IR/GPUBase.td" +include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" + +//===----------------------------------------------------------------------===// +// GPU NVPTX target attribute. +//===----------------------------------------------------------------------===// + +def GPU_NVPTXTargetAttr : GPU_Attr<"NVPTXTarget", "nvptx", [ + DeclareAttrInterfaceMethods + ]> { + let description = [{ + NVPTX target attribute for controlling compilation of NVIDIA targets. All + parameters decay into default values if not present. + + Examples: + + 1. Target with default values. + ``` + gpu.module @mymodule [#gpu.nvptx] attributes {...} { + ... + } + ``` + + 2. Target with `sm_90` chip and fast math. + ``` + gpu.module @mymodule [#gpu.nvptx] { + ... + } + ``` + }]; + let parameters = (ins + DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O, + StringRefParameter<"Target triple.", "\"nvptx64-nvidia-cuda\"">:$triple, + StringRefParameter<"Target chip.", "\"sm_50\"">:$chip, + StringRefParameter<"Target chip features.", "\"+ptx60\"">:$features, + OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags, + OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link + ); + let assemblyFormat = [{ + (`<` struct($O, $triple, $chip, $features, $flags)^ `>`)? 
+ }]; + let builders = [ + AttrBuilder<(ins CArg<"int", "2">:$optLevel, + CArg<"StringRef", "\"nvptx64-nvidia-cuda\"">:$triple, + CArg<"StringRef", "\"sm_50\"">:$chip, + CArg<"StringRef", "\"+ptx60\"">:$features, + CArg<"DictionaryAttr", "nullptr">:$targetFlags, + CArg<"ArrayAttr", "nullptr">:$linkFiles), [{ + return Base::get($_ctxt, optLevel, triple, chip, features, targetFlags, linkFiles); + }]> + ]; + let skipDefaultBuilders = 1; + let genVerifyDecl = 1; + let extraClassDeclaration = [{ + bool hasFlag(StringRef flag) const; + bool getFastMath() const; + bool getFtz() const; + }]; + let extraClassDefinition = [{ + bool $cppClass::hasFlag(StringRef flag) const { + if (DictionaryAttr flags = getFlags()) + return flags.get(flag) != nullptr; + return false; + } + bool $cppClass::getFastMath() const { + return hasFlag("fast"); + } + bool $cppClass::getFtz() const { + return hasFlag("ftz"); + } + }]; +} + +//===----------------------------------------------------------------------===// +// GPU AMDGPU target attribute. +//===----------------------------------------------------------------------===// + +def GPU_AMDGPUTargetAttr : GPU_Attr<"AMDGPUTarget", "amdgpu", [ + DeclareAttrInterfaceMethods + ]> { + let description = [{ + AMDGPU target attribute for controlling compilation of AMDGPU targets. All + parameters decay into default values if not present. + + Examples: + + 1. Target with default values. + ``` + gpu.module @mymodule [#gpu.amdgpu] attributes {...} { + ... + } + ``` + + 2. Target with `gfx90a` chip and fast math. + ``` + gpu.module @mymodule [#gpu.amdgpu] { + ... 
+ } + ``` + }]; + let parameters = (ins + DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O, + StringRefParameter<"Target triple.", "\"amdgcn-amd-amdhsa\"">:$triple, + StringRefParameter<"Target chip.", "\"gfx900\"">:$chip, + StringRefParameter<"Target chip features.", "\"\"">:$features, + StringRefParameter<"ABI version.", "\"500\"">:$abi, + OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags, + OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link + ); + let assemblyFormat = [{ + (`<` struct($O, $triple, $chip, $features, $abi, $flags)^ `>`)? + }]; + let builders = [ + AttrBuilder<(ins CArg<"int", "2">:$optLevel, + CArg<"StringRef", "\"amdgcn-amd-amdhsa\"">:$triple, + CArg<"StringRef", "\"gfx900\"">:$chip, + CArg<"StringRef", "\"\"">:$features, + CArg<"StringRef", "\"500\"">:$abiVersion, + CArg<"DictionaryAttr", "nullptr">:$targetFlags, + CArg<"ArrayAttr", "nullptr">:$linkFiles), [{ + return Base::get($_ctxt, optLevel, triple, chip, features, abiVersion, + targetFlags, linkFiles); + }]> + ]; + let skipDefaultBuilders = 1; + let genVerifyDecl = 1; + let extraClassDeclaration = [{ + bool hasFlag(StringRef flag) const; + bool getWave64() const; + bool getFastMath() const; + bool getDaz() const; + bool getFiniteOnly() const; + bool getUnsafeMath() const; + bool getCorrectSqrt() const; + }]; + let extraClassDefinition = [{ + bool $cppClass::hasFlag(StringRef flag) const { + if (DictionaryAttr flags = getFlags()) + return flags.get(flag) != nullptr; + return false; + } + bool $cppClass::getWave64() const { + return hasFlag("wave64") || !hasFlag("no_wave64"); + } + bool $cppClass::getFastMath() const { + return hasFlag("fast"); + } + bool $cppClass::getDaz() const { + return hasFlag("daz"); + } + bool $cppClass::getFiniteOnly() const { + return hasFlag("finite_only"); + } + bool $cppClass::getUnsafeMath() const { + return hasFlag("unsafe_math"); + } + bool $cppClass::getCorrectSqrt() const { + return 
!hasFlag("unsafe_sqrt"); + } + }]; +} + +#endif // GPU_COMPILATIONATTR diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h @@ -28,7 +28,14 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/STLExtras.h" +namespace llvm { +class IRBuilderBase; +} + namespace mlir { +namespace LLVM { +class ModuleTranslation; +} namespace gpu { /// Utility class for the GPU dialect to represent triples of `Value`s @@ -180,9 +187,45 @@ using SparseDnTensorHandleType = SparseHandleType; using SparseSpMatHandleType = SparseHandleType; +/// This class serves as an opaque interface for passing options to the +/// `TargetAttrInterface` methods. Users of this class must implement the +/// `classof` method as well as using the macros `MLIR_*_EXPLICIT_TYPE_ID` to +/// ensure type safety. +class TargetOptions { +public: + /// Constructor initializing the toolkit path and the list of bitcode files. + TargetOptions(StringRef toolkitPath = {}, + ArrayRef bitcodeFiles = {}); + + /// Returns the typeID. + TypeID getTypeID() const; + + /// Returns the toolkit path. + StringRef getToolkitPath() const; + + /// Returns the bitcode files to link to. + ArrayRef getBitcodeFiles() const; + +protected: + /// Derived classes must use this constructor to initialize `typeID` to the + /// appropriate value, i.e., `TargetOptions(TypeID::get())`. + TargetOptions(TypeID typeID, StringRef toolkitPath = {}, + ArrayRef bitcodeFiles = {}); + + /// Path to the target toolkit. + StringRef toolkitPath; + + /// List of files to link with the LLVM module. 
+ ArrayRef bitcodeFiles; + +private: + TypeID typeID; +}; } // namespace gpu } // namespace mlir +MLIR_DECLARE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) + #include "mlir/Dialect/GPU/IR/GPUOpsEnums.h.inc" #include "mlir/Dialect/GPU/IR/GPUOpsDialect.h.inc" @@ -191,6 +234,8 @@ #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" +#include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.h.inc" + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.h.inc" diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -15,6 +15,8 @@ include "mlir/Dialect/DLTI/DLTIBase.td" include "mlir/Dialect/GPU/IR/GPUBase.td" +include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td" +include "mlir/Dialect/GPU/IR/GPUCompilationAttr.td" include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td" include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td" include "mlir/IR/EnumAttr.td" @@ -429,14 +431,17 @@ let hasVerifier = 1; } +def LaunchIndx : AnyTypeOf<[Index, I32, I64]>; + def GPU_LaunchFuncOp : GPU_Op<"launch_func", [GPU_AsyncOpInterface, AttrSizedOperandSegments]>, Arguments<(ins Variadic:$asyncDependencies, SymbolRefAttr:$kernel, - Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, - Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, + LaunchIndx:$gridSizeX, LaunchIndx:$gridSizeY, LaunchIndx:$gridSizeZ, + LaunchIndx:$blockSizeX, LaunchIndx:$blockSizeY, LaunchIndx:$blockSizeZ, Optional:$dynamicSharedMemorySize, - Variadic:$kernelOperands)>, + Variadic:$kernelOperands, + Optional:$asyncObject)>, Results<(outs Optional:$asyncToken)> { let summary = "Launches a function as a GPU kernel"; @@ -527,7 +532,11 @@ "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize, "ValueRange":$kernelOperands, CArg<"Type", "nullptr">:$asyncTokenType, - CArg<"ValueRange", "{}">:$asyncDependencies)> + CArg<"ValueRange", 
"{}">:$asyncDependencies)>, + OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize, + "KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize, + "ValueRange":$kernelOperands, + CArg<"Value", "nullptr">:$asyncObject)> ]; let extraClassDeclaration = [{ @@ -557,9 +566,10 @@ let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) + (`<` $asyncObject^ type($asyncObject) `>`)? $kernel - `blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)` - `threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)` + `blocks` `in` custom($gridSizeX, type($gridSizeX), $gridSizeY, type($gridSizeY), $gridSizeZ, type($gridSizeZ)) + `threads` `in` custom($blockSizeX, type($blockSizeX), $blockSizeY, type($blockSizeY), $blockSizeZ, type($blockSizeZ)) (`dynamic_shared_memory_size` $dynamicSharedMemorySize^)? custom($kernelOperands, type($kernelOperands)) attr-dict }]; @@ -998,10 +1008,10 @@ } def GPU_GPUModuleOp : GPU_Op<"module", [ - DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove, - SymbolTable, Symbol, - SingleBlockImplicitTerminator<"ModuleEndOp"> -]> { + DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove, + SymbolTable, Symbol, + SingleBlockImplicitTerminator<"ModuleEndOp"> + ]>, Arguments<(ins OptionalAttr:$targets)> { let summary = "A top level compilation unit containing code to be run on a GPU."; let description = [{ GPU module contains code that is intended to be run on a GPU. A host device @@ -1019,15 +1029,21 @@ or not intended to be run on the separate device. ``` - gpu.module @symbol_name { + gpu.module @symbol_name { + gpu.func {} + ... + gpu.module_end + } + gpu.module @symbol_name2 [#gpu.amdgpu] { gpu.func {} ... 
gpu.module_end } - ``` }]; - let builders = [OpBuilder<(ins "StringRef":$name)>]; + let builders = [ + OpBuilder<(ins "StringRef":$name, CArg<"ArrayAttr", "{}">:$targets)> + ]; let regions = (region SizedRegion<1>:$bodyRegion); let hasCustomAssemblyFormat = 1; @@ -1047,6 +1063,54 @@ let assemblyFormat = "attr-dict"; } +def GPU_ObjectAttr : GPU_Attr<"Object", "object"> { + let description = [{ + A GPU object attribute pairs a GPU target with a binary string, + encapsulating the information of how the object was generated with the + object itself. + + The target attribute must implement the `TargetAttrInterface` interface. + + ``` + #gpu.object<#gpu.nvptx, "..."> + ``` + }]; + let parameters = (ins "Attribute":$target, "StringAttr":$object); + let assemblyFormat = [{`<` $target `,` $object `>`}]; + let genVerifyDecl = 1; +} + +def GPUObjectArrayAttr : + TypedArrayAttrBase; + +def GPU_BinaryOp : GPU_Op<"binary", [Symbol, GlobalSymbol]>, + Arguments<(ins SymbolNameAttr:$sym_name, + GPUObjectManagerAttr:$objectManager, + ConfinedAttr]>:$objects)> { + let summary = "An op for storing serialized GPU binary objects."; + let description = [{ + GPU binaries provide a semantic mechanism for storing GPU objects, + e.g. the result of compiling a GPU module to an object file. + + This operation has 3 arguments: + - The name of the binary. + - An attribute implementing the `ObjectManagerAttrInterface` interface. + - An array of GPU object attributes. 
+ + ``` + gpu.binary @myobject [#gpu.object<...>, #gpu.object<...>] + ``` + }]; + let builders = [ + OpBuilder<(ins "StringRef":$name, "Attribute":$objectManager, + "ArrayAttr":$objects)> + ]; + let skipDefaultBuilders = 1; + let assemblyFormat = [{ + $sym_name custom($objectManager) attr-dict-with-keyword $objects + }]; +} + def GPU_HostRegisterOp : GPU_Op<"host_register">, Arguments<(ins AnyUnrankedMemRef:$value)> { let summary = "Registers a memref for access from device."; diff --git a/mlir/include/mlir/ExecutionEngine/ModuleToObject.h b/mlir/include/mlir/ExecutionEngine/ModuleToObject.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/ModuleToObject.h @@ -0,0 +1,117 @@ +//===- ModuleToObject.h - Module to object base class -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the base class for transforming Operations into binary +// objects. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_MODULETOOBJECT_H +#define MLIR_EXECUTIONENGINE_MODULETOOBJECT_H + +#include "mlir/IR/Operation.h" +#include "llvm/IR/Module.h" + +namespace llvm { +class TargetMachine; +} // namespace llvm + +namespace mlir { +namespace LLVM { +class ModuleTranslation; +} +/// Utility base class for transforming Operations into binary objects, by +/// default it returns the serialized bitcode for the module. +class ModuleToObject { +public: + ModuleToObject(Operation &module, StringRef triple, StringRef chip, + StringRef features = {}, int optLevel = 3); + virtual ~ModuleToObject() = default; + + /// Returns the gpu.module being serialized. 
+ Operation &getOperation(); + + /// Runs the serialization pipeline, returning `std::nullopt` on error. + virtual std::optional> run(); + +protected: + // Hooks to be implemented by derived classes. + + /// Hook for loading bitcode files, returns std::nullopt on failure. + virtual std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) { + return SmallVector>(); + } + + /// Hook for performing additional actions on a loaded bitcode file. + virtual void handleBitcodeFile(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + /// Hook for performing additional actions on the llvmModule pre linking. + virtual void handleModulePreLink(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + /// Hook for performing additional actions on the llvmModule post linking. + virtual void handleModulePostLink(llvm::Module &module, + llvm::TargetMachine &targetMachine) {} + + /// Serializes the LLVM IR bitcode to an object file, by default it serializes + /// to LLVM bitcode. + virtual std::optional> + moduleToObject(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); + +protected: + /// Create the target machine based on the target triple and chip. + std::unique_ptr createTargetMachine(); + + /// Loads a bitcode file from path. + std::unique_ptr loadBitcodeFile(llvm::LLVMContext &context, + StringRef path); + + /// Loads multiple bitcode files. + LogicalResult loadBitcodeFilesFromList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError = true); + + /// Translates the gpu.module to LLVM IR. + std::unique_ptr + translateToLLVMIR(llvm::LLVMContext &llvmContext); + + /// Link the llvmModule to other bitcode file. + LogicalResult linkFiles(llvm::Module &module, + SmallVector> &&libs); + + /// Optimize the module. 
+ LogicalResult optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, int optL); + + /// Utility function for translating to ISA, returns `std::nullopt` on + /// failure. + static std::optional + translateToISA(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); + +protected: + /// Module to transform to a binary object. + Operation &module; + + /// Target triple. + StringRef triple; + + /// Target chip. + StringRef chip; + + /// Target features. + StringRef features; + + /// Optimization level. + int optLevel; +}; +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_MODULETOOBJECT_H diff --git a/mlir/include/mlir/IR/AttrTypeBase.td b/mlir/include/mlir/IR/AttrTypeBase.td --- a/mlir/include/mlir/IR/AttrTypeBase.td +++ b/mlir/include/mlir/IR/AttrTypeBase.td @@ -342,11 +342,12 @@ } // For StringRefs, which require allocation. -class StringRefParameter : +class StringRefParameter : AttrOrTypeParameter<"::llvm::StringRef", desc> { let allocator = [{$_dst = $_allocator.copyInto($_self);}]; let printer = [{$_printer << '"' << $_self << '"';}]; let cppStorageType = "std::string"; + let defaultValue = value; } // For APFloats, which require comparison. diff --git a/mlir/include/mlir/IR/SymbolInterfaces.td b/mlir/include/mlir/IR/SymbolInterfaces.td --- a/mlir/include/mlir/IR/SymbolInterfaces.td +++ b/mlir/include/mlir/IR/SymbolInterfaces.td @@ -219,6 +219,9 @@ // Symbol Traits //===----------------------------------------------------------------------===// +// Op defines a global symbol. +def GlobalSymbol : NativeOpTrait<"GlobalSymbol", [Symbol]>; + // Op defines a symbol table. def SymbolTable : NativeOpTrait<"SymbolTable">; diff --git a/mlir/include/mlir/IR/SymbolTable.h b/mlir/include/mlir/IR/SymbolTable.h --- a/mlir/include/mlir/IR/SymbolTable.h +++ b/mlir/include/mlir/IR/SymbolTable.h @@ -448,4 +448,22 @@ /// Include the generated symbol interfaces. 
#include "mlir/IR/SymbolInterfaces.h.inc" +namespace mlir { +namespace OpTrait { +/// A trait used to indicate that a symbol has a global scope. Operations using +/// this trait must also adhere to the constraints defined by the `Symbol` +/// trait. +template +class GlobalSymbol : public TraitBase { + static LogicalResult verifyTrait(Operation *op) { + static_assert(ConcreteType::template hasTrait(), + "expected operation must have zero results"); + static_assert(ConcreteType::template hasTrait(), + "expected operation must inherit the `Symbol` trait"); + return success(); + } +}; +} // namespace OpTrait +} // namespace mlir + #endif // MLIR_IR_SYMBOLTABLE_H diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -32,6 +32,7 @@ MLIRGPUOpsAttributesIncGen MLIRGPUOpsEnumsGen MLIRGPUOpInterfacesIncGen + MLIRGPUCompilationAttrInterfacesIncGen LINK_LIBS PUBLIC MLIRArithDialect @@ -41,6 +42,33 @@ MLIRMemRefDialect MLIRSideEffectInterfaces MLIRSupport + + PRIVATE + MLIRGPUTargets + ) + +add_mlir_dialect_library(MLIRGPUTargets + Targets/AMDGPUTarget.cpp + Targets/NVPTXTarget.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU + + LINK_COMPONENTS + Core + MC + Target + ${NVPTX_LIBS} + ${AMDGPU_LIBS} + + LINK_LIBS PUBLIC + MLIRIR + MLIRExecutionEngineUtils + MLIRSupport + MLIRTargetLLVMIRExport + + PRIVATE + MLIRGPUDialect ) add_mlir_dialect_library(MLIRGPUTransforms @@ -128,6 +156,35 @@ ${CUDA_DRIVER_LIBRARY} ) + # Find the CUDA toolkit. + if (NOT DEFINED CUDAToolkit_ROOT) + find_package(CUDAToolkit) + get_filename_component(CUDAToolkit_ROOT ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + endif() + message(VERBOSE "MLIR Default CUDA toolkit path: ${CUDAToolkit_ROOT}") + + # Enable the gpu to cubin target. 
+ target_compile_definitions(obj.MLIRGPUTargets + PRIVATE + MLIR_GPU_NVPTX_TARGET_ENABLED=1 + __DEFAULT_CUDATOOLKIT_PATH__="${CUDAToolkit_ROOT}" + ) + # Enable the gpu to cubin target. + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_NVPTX_TARGET_ENABLED=1 + ) + + # Add CUDA headers includes and the libcuda.so library. + target_include_directories(obj.MLIRGPUTargets + PRIVATE + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ) + target_link_libraries(MLIRGPUTargets + PRIVATE + ${CUDA_DRIVER_LIBRARY} + ) + endif() if(MLIR_ENABLE_ROCM_CONVERSIONS) @@ -136,13 +193,32 @@ "Building mlir with ROCm support requires the AMDGPU backend") endif() - set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") + if (DEFINED ROCM_PATH) + set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs") + elseif(DEFINED ENV{ROCM_PATH}) + set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs") + else() + set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs") + endif() + message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}") + target_compile_definitions(obj.MLIRGPUTransforms PRIVATE __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" MLIR_GPU_TO_HSACO_PASS_ENABLE=1 ) + # Enable the gpu to amdgpu target. + target_compile_definitions(obj.MLIRGPUTargets + PRIVATE + MLIR_GPU_AMDGPU_TARGET_ENABLED=1 + __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}" + ) + target_compile_definitions(obj.MLIRGPUTransforms + PRIVATE + MLIR_GPU_AMDGPU_TARGET_ENABLED=1 + ) + target_link_libraries(MLIRGPUTransforms PRIVATE MLIRROCDLToLLVMIRTranslation diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -275,11 +275,21 @@ return success(); // Check that `launch_func` refers to a well-formed GPU kernel module. 
- StringAttr kernelModuleName = launchOp.getKernelModuleName(); - auto kernelModule = module.lookupSymbol(kernelModuleName); + StringAttr kernelContainerName = launchOp.getKernelModuleName(); + Operation *kernelContainer = module.lookupSymbol(kernelContainerName); + if (!kernelContainer) + return launchOp.emitOpError() + << "kernel container '" << kernelContainerName.getValue() + << "' is undefined"; + + // If the container is a GPU binary op return success. + if (isa(kernelContainer)) + return success(); + + auto kernelModule = dyn_cast(kernelContainer); if (!kernelModule) return launchOp.emitOpError() - << "kernel module '" << kernelModuleName.getValue() + << "kernel module '" << kernelContainerName.getValue() << "' is undefined"; // Check that `launch_func` refers to a well-formed kernel function. @@ -937,10 +947,36 @@ SymbolRefAttr::get(kernelModule.getNameAttr(), {SymbolRefAttr::get(kernelFunc.getNameAttr())}); result.addAttribute(getKernelAttrName(result.name), kernelSymbol); - SmallVector segmentSizes(9, 1); + SmallVector segmentSizes(10, 1); segmentSizes.front() = asyncDependencies.size(); - segmentSizes[segmentSizes.size() - 2] = dynamicSharedMemorySize ? 1 : 0; - segmentSizes.back() = static_cast(kernelOperands.size()); + segmentSizes[segmentSizes.size() - 3] = dynamicSharedMemorySize ? 1 : 0; + segmentSizes[segmentSizes.size() - 2] = + static_cast(kernelOperands.size()); + segmentSizes.back() = 0; + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr(segmentSizes)); +} + +void LaunchFuncOp::build(OpBuilder &builder, OperationState &result, + SymbolRefAttr kernel, KernelDim3 gridSize, + KernelDim3 getBlockSize, Value dynamicSharedMemorySize, + ValueRange kernelOperands, Value asyncObject) { + // Add grid and block sizes as op operands, followed by the data operands. 
+ result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x, + getBlockSize.y, getBlockSize.z}); + if (dynamicSharedMemorySize) + result.addOperands(dynamicSharedMemorySize); + result.addOperands(kernelOperands); + if (asyncObject) + result.addOperands(asyncObject); + result.addAttribute(getKernelAttrName(result.name), kernel); + SmallVector segmentSizes(10, 1); + segmentSizes.front() = 0; + segmentSizes[segmentSizes.size() - 3] = dynamicSharedMemorySize ? 1 : 0; + segmentSizes[segmentSizes.size() - 2] = + static_cast(kernelOperands.size()); + + segmentSizes.back() = asyncObject ? 1 : 0; result.addAttribute(getOperandSegmentSizeAttr(), builder.getDenseI32ArrayAttr(segmentSizes)); } @@ -982,9 +1018,51 @@ GPUDialect::getContainerModuleAttrName() + "' attribute"); + KernelDim3 grid = getGridSizeOperandValues(); + KernelDim3 block = getBlockSizeOperandValues(); + if (grid.x.getType() != grid.y.getType() || + grid.x.getType() != grid.z.getType() || + grid.x.getType() != block.x.getType() || + grid.x.getType() != block.y.getType() || + grid.x.getType() != block.z.getType()) + return emitOpError( + "expected the grid and block sizes all having the same type"); + return success(); +} + +static ParseResult +parseDim3(OpAsmParser &parser, OpAsmParser::UnresolvedOperand &sizeX, + Type &sizeXTy, OpAsmParser::UnresolvedOperand &sizeY, Type &sizeYTy, + OpAsmParser::UnresolvedOperand &sizeZ, Type &sizeZTy) { + if (parser.parseLParen() || parser.parseOperand(sizeX) || + parser.parseComma() || parser.parseOperand(sizeY) || + parser.parseComma() || parser.parseOperand(sizeZ) || parser.parseRParen()) + return failure(); + SmallVector types; + if (failed(parser.parseOptionalColonTypeList(types))) + return failure(); + if (types.size()) { + sizeXTy = types[0]; + sizeYTy = types[0]; + sizeZTy = types[0]; + } else { + types.push_back(IndexType::get(parser.getContext())); + sizeXTy = types[0]; + sizeYTy = types[0]; + sizeZTy = types[0]; + } return success(); } +static void 
printDim3(OpAsmPrinter &printer, Operation *op, Value sizeX, + Type sizeXTy, Value sizeY, Type sizeYTy, Value sizeZ, + Type sizeZTy) { + printer << '(' << sizeX << ", " << sizeY << ", " << sizeZ << ')'; + IndexType indexType = IndexType::get(op->getContext()); + if (indexType != sizeXTy) + printer << " : " << sizeXTy; +} + static ParseResult parseLaunchFuncOperands( OpAsmParser &parser, SmallVectorImpl &argNames, @@ -1456,18 +1534,35 @@ //===----------------------------------------------------------------------===// void GPUModuleOp::build(OpBuilder &builder, OperationState &result, - StringRef name) { + StringRef name, ArrayAttr targets) { ensureTerminator(*result.addRegion(), builder, result.location); result.attributes.push_back(builder.getNamedAttr( ::mlir::SymbolTable::getSymbolAttrName(), builder.getStringAttr(name))); + + if (targets) + result.getOrAddProperties().targets = targets; } ParseResult GPUModuleOp::parse(OpAsmParser &parser, OperationState &result) { StringAttr nameAttr; + ArrayAttr targetsAttr; + if (parser.parseSymbolName(nameAttr, mlir::SymbolTable::getSymbolAttrName(), - result.attributes) || - // If module attributes are present, parse them. - parser.parseOptionalAttrDictWithKeyword(result.attributes)) + result.attributes)) + return failure(); + + // Parse the optional array of target attributes. + OptionalParseResult targetsAttrResult = + parser.parseOptionalAttribute(targetsAttr, Type{}); + if (targetsAttrResult.has_value()) { + if (failed(*targetsAttrResult)) { + return failure(); + } + result.getOrAddProperties().targets = targetsAttr; + } + + // If module attributes are present, parse them. + if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) return failure(); // Parse the module body. 
@@ -1483,13 +1578,56 @@ void GPUModuleOp::print(OpAsmPrinter &p) { p << ' '; p.printSymbolName(getName()); - p.printOptionalAttrDictWithKeyword((*this)->getAttrs(), - {mlir::SymbolTable::getSymbolAttrName()}); + + if (Attribute attr = getTargetsAttr()) { + p << ' '; + p.printAttribute(attr); + p << ' '; + } + + p.printOptionalAttrDictWithKeyword( + (*this)->getAttrs(), + {mlir::SymbolTable::getSymbolAttrName(), getTargetsAttrName()}); p << ' '; p.printRegion(getRegion(), /*printEntryBlockArgs=*/false, /*printBlockTerminators=*/false); } +//===----------------------------------------------------------------------===// +// GPUBinaryOp +//===----------------------------------------------------------------------===// + +LogicalResult +ObjectAttr::verify(function_ref emitError, + Attribute target, StringAttr object) { + if (target && target.hasTrait()) + return success(); + emitError() << "The target parameter must implement `TargetAttrInterface`."; + return failure(); +} + +void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name, + Attribute manager, ArrayAttr objects) { + auto &properties = result.getOrAddProperties(); + result.attributes.push_back(builder.getNamedAttr( + SymbolTable::getSymbolAttrName(), builder.getStringAttr(name))); + properties.objects = objects; + properties.objectManager = manager; +} + +static ParseResult parseObjectManager(OpAsmParser &parser, + Attribute &objectManager) { + if (parser.parseAttribute(objectManager)) + return failure(); + return success(); +} + +static void printObjectManager(OpAsmPrinter &printer, Operation *op, + Attribute objectManager) { + if (objectManager) + printer << '<' << objectManager << '>'; +} + //===----------------------------------------------------------------------===// // GPUMemcpyOp //===----------------------------------------------------------------------===// @@ -1772,6 +1910,28 @@ results.add(context); } 
+//===----------------------------------------------------------------------===// +// GPU Compilation options +//===----------------------------------------------------------------------===// + +TargetOptions::TargetOptions(StringRef toolkitPath, + ArrayRef bitcodeFiles) + : TargetOptions(TypeID::get(), toolkitPath, bitcodeFiles) {} + +TargetOptions::TargetOptions(TypeID typeID, StringRef toolkitPath, + ArrayRef bitcodeFiles) + : toolkitPath(toolkitPath), bitcodeFiles(bitcodeFiles), typeID(typeID) {} + +TypeID TargetOptions::getTypeID() const { return typeID; } + +StringRef TargetOptions::getToolkitPath() const { return toolkitPath; } + +ArrayRef TargetOptions::getBitcodeFiles() const { + return bitcodeFiles; +} + +MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) + #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc" #include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc" @@ -1780,3 +1940,5 @@ #define GET_OP_CLASSES #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" + +#include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.cpp.inc" diff --git a/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp b/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Targets/AMDGPUTarget.cpp @@ -0,0 +1,406 @@ +//===- AMDGPUTarget.cpp - MLIR GPU Dialect AMDGPU target attribute --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This files implements the AMDGPU target attribute. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" + +using namespace mlir; +using namespace mlir::gpu; + +#ifdef MLIR_GPU_AMDGPU_TARGET_ENABLED +#include "mlir/ExecutionEngine/ModuleToObject.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/TargetParser/TargetParser.h" + +#ifndef __DEFAULT_ROCM_PATH__ +#define __DEFAULT_ROCM_PATH__ "" +#endif + +#define DEBUG_TYPE "serialize-to-object" + +namespace { +struct InitTarget { + InitTarget() { + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmParser(); + LLVMInitializeAMDGPUAsmPrinter(); + } +}; + +class SerializeToHSA : public ModuleToObject { +public: + SerializeToHSA(Operation &module, AMDGPUTargetAttr target, + TargetOptions targetOptions = {}); + + // Init the target. + static void init(); + + // Get the paths of ROCm device libraries. Function adapted from: + // https://github.com/llvm/llvm-project/blob/main/clang/lib/Driver/ToolChains/AMDGPU.cpp + void getCommonBitcodeLibs(llvm::SmallVector &libs, + SmallVector &libPath, + StringRef isaVersion, bool wave64, bool daz, + bool finiteOnly, bool unsafeMath, bool fastMath, + bool correctSqrt, StringRef abiVer); + + // Removes unnecessary metadata from the loaded bitcode files. + void handleBitcodeFile(llvm::Module &module, + llvm::TargetMachine &targetMachine) override; + // Assembles the object. + std::optional> assembleIsa(StringRef isa); + + // Create the HSACO object. 
+ std::optional> createHsaco(SmallVector &&ptx); + + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override; + + std::optional> + moduleToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) override; + +private: + AMDGPUTargetAttr target; + StringRef toolkitPath; + SmallVector fileList; +}; +} // namespace + +SerializeToHSA::SerializeToHSA(Operation &module, AMDGPUTargetAttr target, + TargetOptions targetOptions) + : ModuleToObject(module, target.getTriple(), target.getChip(), + target.getFeatures(), target.getO()), + target(target), toolkitPath(targetOptions.getToolkitPath()), + fileList(targetOptions.getBitcodeFiles()) { + if (toolkitPath.empty()) + toolkitPath = __DEFAULT_ROCM_PATH__; + + if (ArrayAttr files = target.getLink()) + for (Attribute attr : files.getValue()) + if (auto file = dyn_cast(attr)) + fileList.push_back(file.str()); +} + +void SerializeToHSA::init() { static InitTarget target = InitTarget(); } + +void SerializeToHSA::getCommonBitcodeLibs(llvm::SmallVector &libs, + SmallVector &libPath, + StringRef isaVersion, bool wave64, + bool daz, bool finiteOnly, + bool unsafeMath, bool fastMath, + bool correctSqrt, StringRef abiVer) { + auto addLib = [&](StringRef path) { + if (!llvm::sys::fs::is_regular_file(path)) { + getOperation().emitRemark() << "Bitcode library path: " << path + << " does not exist or is not a file.\n"; + return; + } + libs.push_back(path.str()); + }; + auto optLib = [](StringRef name, bool on) -> Twine { + return name + (on ? "_on" : "_off"); + }; + auto getLibPath = [&libPath](Twine lib) { + auto baseSize = libPath.size(); + llvm::sys::path::append(libPath, lib + ".bc"); + std::string path(StringRef(libPath.data(), libPath.size()).str()); + libPath.truncate(baseSize); + return path; + }; + + // Add ROCm device libraries. 
+ addLib(getLibPath("ocml")); + addLib(getLibPath("ockl")); + addLib(getLibPath(optLib("oclc_daz_opt", daz))); + addLib(getLibPath(optLib("oclc_unsafe_math", unsafeMath || fastMath))); + addLib(getLibPath(optLib("oclc_finite_only", finiteOnly || fastMath))); + addLib(getLibPath(optLib("oclc_correctly_rounded_sqrt", correctSqrt))); + addLib(getLibPath(optLib("oclc_wavefrontsize64", wave64))); + addLib(getLibPath("oclc_isa_version_" + isaVersion)); + if (abiVer.size()) + addLib(getLibPath("oclc_abi_version_" + abiVer)); +} + +std::optional>> +SerializeToHSA::loadBitcodeFiles(llvm::LLVMContext &context, + llvm::Module &module) { + // Try loading device libraries from the ROCm toolkit installation. + StringRef pathRef = toolkitPath; + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + llvm::sys::path::append(path, "amdgcn", "bitcode"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef + << " does not exist or is not a directory."; + return std::nullopt; + } + StringRef isaVersion = + llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip)); + isaVersion.consume_front("gfx"); + getCommonBitcodeLibs(fileList, path, isaVersion, target.getWave64(), + target.getDaz(), target.getFiniteOnly(), + target.getUnsafeMath(), target.getFastMath(), + target.getCorrectSqrt(), target.getAbi()); + } + + SmallVector> bcFiles; + if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true))) + return std::nullopt; + return bcFiles; +} + +void SerializeToHSA::handleBitcodeFile(llvm::Module &module, + llvm::TargetMachine &targetMachine) { + // Some ROCM builds don't strip this like they should + if (auto *openclVersion = module.getNamedMetadata("opencl.ocl.version")) + module.eraseNamedMetadata(openclVersion); + // Stop spamming us with clang version numbers + if (auto *ident = 
module.getNamedMetadata("llvm.ident")) + module.eraseNamedMetadata(ident); +} + +//===----------------------------------------------------------------------===// +// AMDGPU pipeline methods. +//===----------------------------------------------------------------------===// +#include "mlir/Support/FileUtilities.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/Program.h" + +std::optional> SerializeToHSA::assembleIsa(StringRef isa) { + auto loc = getOperation().getLoc(); + + StringRef targetTriple = this->triple; + + SmallVector result; + llvm::raw_svector_ostream os(result); + + llvm::Triple triple(llvm::Triple::normalize(targetTriple)); + std::string error; + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(triple.normalize(), error); + if (!target) { + emitError(loc, Twine("failed to lookup target: ") + error); + return std::nullopt; + } + + llvm::SourceMgr srcMgr; + srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), SMLoc()); + + const llvm::MCTargetOptions mcOptions; + std::unique_ptr mri( + target->createMCRegInfo(targetTriple)); + std::unique_ptr mai( + target->createMCAsmInfo(*mri, targetTriple, mcOptions)); + mai->setRelaxELFRelocations(true); + std::unique_ptr sti( + target->createMCSubtargetInfo(targetTriple, chip, features)); + + llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr, + &mcOptions); + std::unique_ptr mofi(target->createMCObjectFileInfo( + ctx, /*PIC=*/false, /*LargeCodeModel=*/false)); + ctx.setObjectFileInfo(mofi.get()); + + SmallString<128> cwd; 
+ if (!llvm::sys::fs::current_path(cwd)) + ctx.setCompilationDir(cwd); + + std::unique_ptr mcStreamer; + std::unique_ptr mcii(target->createMCInstrInfo()); + + llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, ctx); + llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions); + mcStreamer.reset(target->createMCObjectStreamer( + triple, ctx, std::unique_ptr(mab), + mab->createObjectWriter(os), std::unique_ptr(ce), + *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, + /*DWARFMustBeAtTheEnd*/ false)); + mcStreamer->setUseAssemblerInfoForParsing(true); + + std::unique_ptr parser( + createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); + std::unique_ptr tap( + target->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); + + if (!tap) { + emitError(loc, "assembler initialization error"); + return {}; + } + + parser->setTargetParser(*tap); + parser->Run(false); + + return result; +} + +std::optional> +SerializeToHSA::createHsaco(SmallVector &&ptx) { + SmallVector isaBinary = std::move(ptx); + auto loc = getOperation().getLoc(); + + // Save the ISA binary to a temp file. + int tempIsaBinaryFd = -1; + SmallString<128> tempIsaBinaryFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd, + tempIsaBinaryFilename)) { + emitError(loc, "temporary file for ISA binary creation error"); + return {}; + } + llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); + llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); + tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); + tempIsaBinaryOs.close(); + + // Create a temp file for HSA code object. 
+ int tempHsacoFD = -1; + SmallString<128> tempHsacoFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD, + tempHsacoFilename)) { + emitError(loc, "temporary file for HSA code object creation error"); + return {}; + } + llvm::FileRemover cleanupHsaco(tempHsacoFilename); + + llvm::SmallString<32> lldPath(toolkitPath); + llvm::sys::path::append(lldPath, "llvm", "bin", "ld.lld"); + int lldResult = llvm::sys::ExecuteAndWait( + lldPath, + {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename}); + if (lldResult != 0) { + emitError(loc, "lld invocation error"); + return {}; + } + + // Load the HSA code object. + auto hsacoFile = openInputFile(tempHsacoFilename); + if (!hsacoFile) { + emitError(loc, "read HSA code object from temp file error"); + return {}; + } + + StringRef buffer = hsacoFile->getBuffer(); + + return SmallVector(buffer.begin(), buffer.end()); +} + +std::optional> +SerializeToHSA::moduleToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::optional serializedISA = + translateToISA(llvmModule, targetMachine); + if (!serializedISA) { + getOperation().emitError() << "Failed translating the module to ISA."; + return std::nullopt; + } + + LLVM_DEBUG({ + llvm::dbgs() << "ISA for module: " + << dyn_cast(&getOperation()).getNameAttr() + << "\n"; + llvm::dbgs() << *serializedISA << "\n"; + llvm::dbgs().flush(); + }); + + std::optional> assembledIsa = + assembleIsa(serializedISA.value()); + + if (!assembledIsa) { + getOperation().emitError() << "Failed during ISA assembling."; + return std::nullopt; + } + + return createHsaco(std::move(assembledIsa.value())); +} + +std::optional> +AMDGPUTargetAttr::serializeToObject(Operation *module, + const TargetOptions &options) const { + assert(module && "The module must be non null."); + if (!module) + return std::nullopt; + if (!mlir::isa(module)) { + module->emitError("Module must be a GPU module."); + return std::nullopt; + } + SerializeToHSA::init(); + 
SerializeToHSA serializer(*module, *this, options); + return serializer.run(); +} + +#else +// Provide a null vector for testing purposes. +std::optional> +AMDGPUTargetAttr::serializeToObject(Operation *module, + const TargetOptions &options) const { + assert(module && "The module must be non null."); + if (!module) + return std::nullopt; + if (!mlir::isa(module)) { + module->emitError("Module must be a GPU module."); + return std::nullopt; + } + return SmallVector{}; +} +#endif // MLIR_GPU_AMDGPU_TARGET_ENABLED + +LogicalResult +AMDGPUTargetAttr::verify(function_ref emitError, + int optLevel, StringRef triple, StringRef chip, + StringRef features, StringRef abiVersion, + DictionaryAttr flags, ArrayAttr files) { + if (optLevel < 0 || optLevel > 3) { + emitError() << "The optimization level must be a number between 0 and 3."; + return failure(); + } + if (triple.empty()) { + emitError() << "The target triple cannot be empty."; + return failure(); + } + if (chip.empty()) { + emitError() << "The target chip cannot be empty."; + return failure(); + } + if (abiVersion != "400" && abiVersion != "500") { + emitError() << "Invalid ABI version, it must be either `400` or `500`."; + return failure(); + } + if (files && llvm::all_of(files, [](::mlir::Attribute attr) { + return attr && mlir::isa(attr); + })) { + emitError() << "All the elements in the `link` array must be strings."; + return failure(); + } + return success(); +} diff --git a/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp b/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp @@ -0,0 +1,254 @@ +//===- NVPTXTarget.cpp - MLIR GPU Dialect NVPTX target attribute ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This files implements the NVPTX target attribute. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +using namespace mlir; +using namespace mlir::gpu; + +#ifdef MLIR_GPU_NVPTX_TARGET_ENABLED +#include "mlir/ExecutionEngine/ModuleToObject.h" +#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" + +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/TargetSelect.h" + +#ifndef __DEFAULT_CUDATOOLKIT_PATH__ +#define __DEFAULT_CUDATOOLKIT_PATH__ "" +#endif + +#define DEBUG_TYPE "serialize-to-object" + +#include + +static void emitCudaError(const llvm::Twine &expr, const char *buffer, + CUresult result, Location loc) { + const char *error; + cuGetErrorString(result, &error); + emitError(loc, expr.concat(" failed with error code ") + .concat(llvm::Twine{error}) + .concat("[") + .concat(buffer) + .concat("]")); +} + +#define RETURN_ON_CUDA_ERROR(expr) \ + do { \ + if (auto status = (expr)) { \ + emitCudaError(#expr, jitErrorBuffer, status, loc); \ + return {}; \ + } \ + } while (false) + +namespace { +struct InitTarget { + InitTarget() { + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + } +}; + +class SerializeToCubin : public ModuleToObject { +public: + SerializeToCubin(Operation &module, NVPTXTargetAttr target, + TargetOptions targetOptions = {}); + + // Init the target. 
+ static void init(); + + std::optional>> + loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override; + + std::optional> + moduleToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) override; + +private: + StringRef toolkitPath; + SmallVector fileList; +}; +} // namespace + +SerializeToCubin::SerializeToCubin(Operation &module, NVPTXTargetAttr target, + TargetOptions targetOptions) + : ModuleToObject(module, target.getTriple(), target.getChip(), + target.getFeatures(), target.getO()), + toolkitPath(targetOptions.getToolkitPath()), + fileList(targetOptions.getBitcodeFiles()) { + if (toolkitPath.empty()) + toolkitPath = __DEFAULT_CUDATOOLKIT_PATH__; + + if (ArrayAttr files = target.getLink()) + for (Attribute attr : files.getValue()) + if (auto file = dyn_cast(attr)) + fileList.push_back(file.str()); +} + +void SerializeToCubin::init() { static InitTarget target = InitTarget(); } + +std::optional>> +SerializeToCubin::loadBitcodeFiles(llvm::LLVMContext &context, + llvm::Module &module) { + // Try loading `libdevice` from a CUDA toolkit installation. + StringRef pathRef = toolkitPath; + if (pathRef.size()) { + SmallVector path; + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_directory(pathRef)) { + getOperation().emitError() << "CUDA path: " << pathRef + << " does not exist or is not a directory.\n"; + return std::nullopt; + } + // TODO remove this hard coded path. 
+ llvm::sys::path::append(path, "nvvm", "libdevice", "libdevice.10.bc"); + pathRef = StringRef(path.data(), path.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + getOperation().emitError() << "LibDevice path: " << pathRef + << " does not exist or is not a file.\n"; + return std::nullopt; + } + fileList.push_back(pathRef.str()); + } + + SmallVector> bcFiles; + if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true))) + return std::nullopt; + return bcFiles; +} + +std::optional> +SerializeToCubin::moduleToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::optional serializedISA = + translateToISA(llvmModule, targetMachine); + if (!serializedISA) { + getOperation().emitError() << "Failed translating the module to ISA."; + return std::nullopt; + } + + LLVM_DEBUG({ + llvm::dbgs() << "ISA for module: " + << dyn_cast(&getOperation()).getNameAttr() + << "\n"; + llvm::dbgs() << *serializedISA << "\n"; + llvm::dbgs().flush(); + }); + + auto loc = getOperation().getLoc(); + char jitErrorBuffer[4096] = {0}; + + RETURN_ON_CUDA_ERROR(cuInit(0)); + + // Linking requires a device context. 
+ CUdevice device; + RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0)); + CUcontext context; + RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device)); + CUlinkState linkState; + + CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}; + void *jitOptionsVals[] = {jitErrorBuffer, + reinterpret_cast(sizeof(jitErrorBuffer))}; + + RETURN_ON_CUDA_ERROR(cuLinkCreate(2, /* number of jit options */ + jitOptions, /* jit options */ + jitOptionsVals, /* jit option values */ + &linkState)); + + auto kernelName = dyn_cast(getOperation()).getName().str(); + RETURN_ON_CUDA_ERROR(cuLinkAddData( + linkState, CUjitInputType::CU_JIT_INPUT_PTX, + const_cast(static_cast(serializedISA->c_str())), + serializedISA->length(), kernelName.c_str(), + 0, /* number of jit options */ + nullptr, /* jit options */ + nullptr /* jit option values */ + )); + + void *cubinData; + size_t cubinSize; + RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize)); + + char *cubinAsChar = static_cast(cubinData); + auto result = SmallVector(cubinAsChar, cubinAsChar + cubinSize); + + // This will also destroy the cubin data. + RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState)); + RETURN_ON_CUDA_ERROR(cuCtxDestroy(context)); + return result; +} + +std::optional> +NVPTXTargetAttr::serializeToObject(Operation *module, + const TargetOptions &options) const { + assert(module && "The module must be non null."); + if (!module) + return std::nullopt; + if (!mlir::isa(module)) { + module->emitError("Module must be a GPU module."); + return std::nullopt; + } + SerializeToCubin::init(); + SerializeToCubin serializer(*module, *this, options); + return serializer.run(); +} + +#else +// Provide a null vector for testing purposes. 
+std::optional<SmallVector<char, 0>>
+NVPTXTargetAttr::serializeToObject(Operation *module,
+                                   const TargetOptions &options) const {
+  assert(module && "The module must be non null.");
+  if (!module)
+    return std::nullopt;
+  if (!mlir::isa<gpu::GPUModuleOp>(module)) {
+    module->emitError("Module must be a GPU module.");
+    return std::nullopt;
+  }
+  return SmallVector<char, 0>{};
+}
+#endif // MLIR_GPU_NVPTX_TARGET_ENABLED
+
+// Verifies the NVPTX target attribute parameters: opt level in [0, 3],
+// non-empty triple & chip, and that every entry in the optional `link`
+// array is a string attribute (a bitcode path).
+LogicalResult
+NVPTXTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                        int optLevel, StringRef triple, StringRef chip,
+                        StringRef features, DictionaryAttr flags,
+                        ArrayAttr files) {
+  if (optLevel < 0 || optLevel > 3) {
+    emitError() << "The optimization level must be a number between 0 and 3.";
+    return failure();
+  }
+  if (triple.empty()) {
+    emitError() << "The target triple cannot be empty.";
+    return failure();
+  }
+  if (chip.empty()) {
+    emitError() << "The target chip cannot be empty.";
+    return failure();
+  }
+  // Note the negation: diagnose only when SOME element is NOT a string.
+  if (files && !llvm::all_of(files, [](::mlir::Attribute attr) {
+        return attr && mlir::isa<StringAttr>(attr);
+      })) {
+    emitError() << "All the elements in the `link` array must be strings.";
+    return failure();
+  }
+  return success();
+}
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -19,6 +19,7 @@
 # libMLIR.so.
 add_mlir_library(MLIRExecutionEngineUtils
   OptUtils.cpp
+  ModuleToObject.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/ExecutionEngine
diff --git a/mlir/lib/ExecutionEngine/ModuleToObject.cpp b/mlir/lib/ExecutionEngine/ModuleToObject.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/ModuleToObject.cpp
@@ -0,0 +1,221 @@
+//===- ModuleToObject.cpp - Module to object base class ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the base class for transforming Operations into binary +// objects. +// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/ModuleToObject.h" + +#include "mlir/ExecutionEngine/OptUtils.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" + +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/TargetParser.h" +#include "llvm/Transforms/IPO/Internalize.h" + +using namespace mlir; + +ModuleToObject::ModuleToObject(Operation &module, StringRef triple, + StringRef chip, StringRef features, int optLevel) + : module(module), triple(triple), chip(chip), features(features), + optLevel(optLevel) {} + +Operation &ModuleToObject::getOperation() { return module; } + +std::unique_ptr ModuleToObject::createTargetMachine() { + std::string error; + // Load the target. + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(triple, error); + if (!target) { + getOperation().emitError() << "Failed to lookup target: " << error; + return {}; + } + + // Create the target machine using the target. 
+ llvm::TargetMachine *machine = + target->createTargetMachine(triple, chip, features, {}, {}); + if (!machine) { + getOperation().emitError() << "Failed to create the target machine."; + return {}; + } + return std::unique_ptr{machine}; +} + +std::unique_ptr +ModuleToObject::loadBitcodeFile(llvm::LLVMContext &context, StringRef path) { + llvm::SMDiagnostic error; + std::unique_ptr library = + llvm::getLazyIRFileModule(path, error, context); + if (!library) { + getOperation().emitError() << "Failed loading file from " << path + << ", error: " << error.getMessage(); + return nullptr; + } + return library; +} + +LogicalResult ModuleToObject::loadBitcodeFilesFromList( + llvm::LLVMContext &context, ArrayRef fileList, + SmallVector> &llvmModules, + bool failureOnError) { + for (const std::string &str : fileList) { + // Test if the path exists, if it doesn't abort. + StringRef pathRef = StringRef(str.data(), str.size()); + if (!llvm::sys::fs::is_regular_file(pathRef)) { + getOperation().emitError() + << "File path: " << pathRef << " does not exist or is not a file.\n"; + return failure(); + } + // Load the file or abort on error. + if (auto bcFile = loadBitcodeFile(context, pathRef)) + llvmModules.push_back(std::move(bcFile)); + else if (failureOnError) + return failure(); + } + return success(); +} + +std::unique_ptr +ModuleToObject::translateToLLVMIR(llvm::LLVMContext &llvmContext) { + return translateModuleToLLVMIR(&getOperation(), llvmContext); +} + +LogicalResult +ModuleToObject::linkFiles(llvm::Module &module, + SmallVector> &&libs) { + if (libs.empty()) + return success(); + llvm::Linker linker(module); + for (std::unique_ptr &libModule : libs) { + // This bitcode linking imports the library functions into the module, + // allowing LLVM optimization passes (which must run after linking) to + // optimize across the libraries and the module's code. 
We also only import + // symbols if they are referenced by the module or a previous library since + // there will be no other source of references to those symbols in this + // compilation and since we don't want to bloat the resulting code object. + bool err = linker.linkInModule( + std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded, + [](llvm::Module &m, const StringSet<> &gvs) { + llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) { + return !gv.hasName() || (gvs.count(gv.getName()) == 0); + }); + }); + // True is linker failure + if (err) { + getOperation().emitError("Unrecoverable failure during bitcode linking."); + // We have no guaranties about the state of `ret`, so bail + return failure(); + } + } + return success(); +} + +LogicalResult ModuleToObject::optimizeModule(llvm::Module &module, + llvm::TargetMachine &targetMachine, + int optLevel) { + if (optLevel < 0 || optLevel > 3) + return getOperation().emitError() + << "Invalid optimization level: " << optLevel << "."; + + targetMachine.setOptLevel(static_cast(optLevel)); + + auto transformer = + makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine); + auto error = transformer(&module); + if (error) { + InFlightDiagnostic mlirError = getOperation().emitError(); + llvm::handleAllErrors( + std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) { + mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n"; + }); + return mlirError; + } + return success(); +} + +std::optional +ModuleToObject::translateToISA(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::string targetISA; + llvm::raw_string_ostream stream(targetISA); + + { // Drop pstream after this to prevent the ISA from being stuck buffering + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegenPasses; + + if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, + llvm::CGFT_AssemblyFile)) + return std::nullopt; + + codegenPasses.run(llvmModule); 
+ } + return stream.str(); +} + +std::optional> +ModuleToObject::moduleToObject(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + SmallVector binaryData; + // Write the LLVM module bitcode to a buffer. + llvm::raw_svector_ostream outputStream(binaryData); + llvm::WriteBitcodeToFile(llvmModule, outputStream); + return binaryData; +} + +std::optional> ModuleToObject::run() { + // Translate the module to LLVM IR. + llvm::LLVMContext llvmContext; + std::unique_ptr llvmModule = translateToLLVMIR(llvmContext); + if (!llvmModule) { + getOperation().emitError() << "Failed creating the llvm::Module."; + return std::nullopt; + } + + // Create the target machine. + std::unique_ptr targetMachine = createTargetMachine(); + if (!targetMachine) + return std::nullopt; + + // Set the data layout and target triple of the module. + llvmModule->setDataLayout(targetMachine->createDataLayout()); + llvmModule->setTargetTriple(targetMachine->getTargetTriple().getTriple()); + + // Link bitcode files. + handleModulePreLink(*llvmModule, *targetMachine); + { + auto libs = loadBitcodeFiles(llvmContext, *llvmModule); + if (!libs) + return std::nullopt; + if (libs->size()) + if (failed(linkFiles(*llvmModule, std::move(*libs)))) + return std::nullopt; + handleModulePostLink(*llvmModule, *targetMachine); + } + + // Optimize the module. + if (failed(optimizeModule(*llvmModule, *targetMachine, optLevel))) + return std::nullopt; + + // Return the serialized object. + return moduleToObject(*llvmModule, *targetMachine); +} diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -694,7 +694,8 @@ } /// Create named global variables that correspond to llvm.mlir.global -/// definitions. Convert llvm.global_ctors and global_dtors ops. +/// definitions. Convert llvm.global_ctors and global_dtors ops. 
Finally convert +/// operations with the `GlobalSymbol` trait. LogicalResult ModuleTranslation::convertGlobals() { for (auto op : getModuleBody(mlirModule).getOps()) { llvm::Type *type = convertType(op.getType()); @@ -797,6 +798,18 @@ if (failed(convertDialectAttributes(op))) return failure(); + // Convert operations having the `GlobalSymbol` trait. + { + llvm::IRBuilder<> llvmBuilder(llvmModule->getContext()); + for (Operation &op : getModuleBody(mlirModule).getOperations()) { + if (!isa(&op) && + op.hasTrait() && + failed(convertOperation(op, llvmBuilder))) { + return failure(); + } + } + } + return success(); } @@ -1427,6 +1440,7 @@ if (!isa(&o) && !o.hasTrait() && + !o.hasTrait() && failed(translator.convertOperation(o, llvmBuilder))) { return nullptr; } diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -77,7 +77,7 @@ module attributes {gpu.container_module} { func.func @launch_func_undefined_module(%sz : index) { - // expected-error@+1 {{kernel module 'kernels' is undefined}} + // expected-error@+1 {{kernel container 'kernels' is undefined}} gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) return } @@ -610,3 +610,19 @@ } } } + +// ----- + +module { + // expected-error @+1 {{'gpu.module' op attribute 'targets' failed to satisfy constraint: Array of GPU target attributes with at least 1 elements}} + gpu.module @gpu_funcs [] { + } +} + +// ----- + +module { + // expected-error @+1 {{'gpu.module' op attribute 'targets' failed to satisfy constraint: Array of GPU target attributes with at least 1 elements}} + gpu.module @gpu_funcs [1] { + } +}