diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUCompilationAttr.td
@@ -0,0 +1,91 @@
+//===-- GPUCompilationAttr.td - GPU compilation attributes -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the GPU NVPTX target attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GPU_COMPILATIONATTR
+#define GPU_COMPILATIONATTR
+
+include "mlir/Dialect/GPU/IR/GPUBase.td"
+include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
+
+//===----------------------------------------------------------------------===//
+// GPU NVPTX target attribute.
+//===----------------------------------------------------------------------===//
+
+def GPU_NVPTXTargetAttr : GPU_Attr<"NVPTXTarget", "nvptx", [
+    DeclareAttrInterfaceMethods<GPUTargetAttrInterface, [
+      "serializeToObject"
+    ]>
+  ]> {
+  let description = [{
+    NVPTX target attribute for controlling compilation of NVIDIA targets. All
+    parameters decay into default values if not present.
+
+    Examples:
+
+    1. Target with default values.
+    ```
+      gpu.module @mymodule [#gpu.nvptx] attributes {...} {
+        ...
+      }
+    ```
+
+    2. Target with `sm_90` chip and fast math.
+    ```
+      gpu.module @mymodule [#gpu.nvptx<chip = "sm_90", flags = {fast}>] {
+        ...
+      }
+    ```
+  }];
+  let parameters = (ins
+    DefaultValuedParameter<"int", "2", "Optimization level to apply.">:$O,
+    StringRefParameter<"Target triple.", "\"nvptx64-nvidia-cuda\"">:$triple,
+    StringRefParameter<"Target chip.", "\"sm_50\"">:$chip,
+    StringRefParameter<"Target chip features.", "\"+ptx60\"">:$features,
+    OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags,
+    OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link
+  );
+  let assemblyFormat = [{
+    (`<` struct($O, $triple, $chip, $features, $flags, $link)^ `>`)?
+  }]; // `link` is listed so that it can be parsed and printed as well.
+  let builders = [
+    AttrBuilder<(ins CArg<"int", "2">:$optLevel,
+                     CArg<"StringRef", "\"nvptx64-nvidia-cuda\"">:$triple,
+                     CArg<"StringRef", "\"sm_50\"">:$chip,
+                     CArg<"StringRef", "\"+ptx60\"">:$features,
+                     CArg<"DictionaryAttr", "nullptr">:$targetFlags,
+                     CArg<"ArrayAttr", "nullptr">:$linkFiles), [{
+      return Base::get($_ctxt, optLevel, triple, chip, features, targetFlags, linkFiles);
+    }]>
+  ];
+  let skipDefaultBuilders = 1; // Only the builder above is provided.
+  let genVerifyDecl = 1; // `verify` is defined in Targets/NVPTXTarget.cpp.
+  let extraClassDeclaration = [{
+    bool hasFlag(StringRef flag) const; // True if `flags` has entry `flag`.
+    bool getFastMath() const;           // True if the `fast` flag is present.
+    bool getFtz() const;                // True if the `ftz` flag is present.
+  }];
+  let extraClassDefinition = [{
+    bool $cppClass::hasFlag(StringRef flag) const {
+      if (DictionaryAttr flags = getFlags())
+        return flags.get(flag) != nullptr;
+      return false;
+    }
+    bool $cppClass::getFastMath() const {
+      return hasFlag("fast");
+    }
+    bool $cppClass::getFtz() const {
+      return hasFlag("ftz");
+    }
+  }];
+}
+
+#endif // GPU_COMPILATIONATTR
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -16,6 +16,7 @@
 include "mlir/Dialect/DLTI/DLTIBase.td"
 include "mlir/Dialect/GPU/IR/GPUBase.td"
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
+include "mlir/Dialect/GPU/IR/GPUCompilationAttr.td"
 include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
 include "mlir/Dialect/GPU/TransformOps/GPUDeviceMappingAttr.td"
 include "mlir/IR/EnumAttr.td"
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -42,6 +42,31 @@
   MLIRMemRefDialect
   MLIRSideEffectInterfaces
   MLIRSupport
+
+  PRIVATE
+  MLIRGPUTargets
+  )
+
+add_mlir_dialect_library(MLIRGPUTargets # NVPTX target attribute library.
+  Targets/NVPTXTarget.cpp # Defines NVPTXTargetAttr serialization and verify.
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU
+
+  LINK_COMPONENTS
+  Core
+  MC
+  Target
+  ${NVPTX_LIBS} # Set outside this hunk; presumably NVPTX backend components.
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRExecutionEngineUtils
+  MLIRSupport
+  MLIRTargetLLVMIRExport
+
+  PRIVATE
+  MLIRGPUDialect # NOTE(review): the library above links this one; confirm cycle.
   )
 
 add_mlir_dialect_library(MLIRGPUTransforms
@@ -129,6 +154,35 @@
     ${CUDA_DRIVER_LIBRARY}
   )
 
+  # Find the CUDA toolkit. Quoting keeps a failed lookup from shifting arguments.
+  if (NOT DEFINED CUDAToolkit_ROOT)
+    find_package(CUDAToolkit)
+    get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_BIN_DIR}" DIRECTORY ABSOLUTE)
+  endif()
+  message(VERBOSE "MLIR Default CUDA toolkit path: ${CUDAToolkit_ROOT}")
+
+  # Enable the NVPTX target and record the default CUDA toolkit path.
+  target_compile_definitions(obj.MLIRGPUTargets
+    PRIVATE
+    MLIR_GPU_NVPTX_TARGET_ENABLED=1
+    __DEFAULT_CUDATOOLKIT_PATH__="${CUDAToolkit_ROOT}"
+  )
+  # Tell the GPU transforms library that the NVPTX target is enabled.
+  target_compile_definitions(obj.MLIRGPUTransforms
+    PRIVATE
+    MLIR_GPU_NVPTX_TARGET_ENABLED=1
+  )
+
+  # Add the CUDA header include directories and link the CUDA driver library.
+  target_include_directories(obj.MLIRGPUTargets
+    PRIVATE
+    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+  )
+  target_link_libraries(MLIRGPUTargets
+    PRIVATE
+    ${CUDA_DRIVER_LIBRARY}
+  )
+
 endif()
 
 if(MLIR_ENABLE_ROCM_CONVERSIONS)
diff --git a/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp b/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Targets/NVPTXTarget.cpp
@@ -0,0 +1,254 @@
+//===- NVPTXTarget.cpp - MLIR GPU Dialect NVPTX target attribute ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the NVPTX target attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+using namespace mlir;
+using namespace mlir::gpu;
+
+#ifdef MLIR_GPU_NVPTX_TARGET_ENABLED
+#include "mlir/ExecutionEngine/ModuleToObject.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Export.h"
+
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/TargetSelect.h"
+
+#ifndef __DEFAULT_CUDATOOLKIT_PATH__
+#define __DEFAULT_CUDATOOLKIT_PATH__ ""
+#endif
+
+#define DEBUG_TYPE "serialize-to-object"
+
+#include <cuda.h>
+
+static void emitCudaError(const llvm::Twine &expr, const char *buffer,
+                          CUresult result, Location loc) {
+  const char *error;
+  cuGetErrorString(result, &error);
+  emitError(loc, expr.concat(" failed with error code ")
+                     .concat(llvm::Twine{error})
+                     .concat("[")
+                     .concat(buffer)
+                     .concat("]"));
+}
+
+#define RETURN_ON_CUDA_ERROR(expr)                                             \
+  do { /* Expects `jitErrorBuffer` and `loc` in the enclosing scope. */        \
+    if (CUresult status = (expr); status != CUDA_SUCCESS) {                    \
+      emitCudaError(#expr, jitErrorBuffer, status, loc);                       \
+      return {};                                                               \
+    }                                                                          \
+  } while (false)
+
+namespace {
+struct InitTarget { // Registers the LLVM NVPTX backend when constructed.
+  InitTarget() {
+    LLVMInitializeNVPTXTarget();
+    LLVMInitializeNVPTXTargetInfo();
+    LLVMInitializeNVPTXTargetMC();
+    LLVMInitializeNVPTXAsmPrinter();
+  }
+};
+// Serializes a GPU module to a cubin using the CUDA driver linker.
+class SerializeToCubin : public ModuleToObject {
+public:
+  SerializeToCubin(Operation &module, NVPTXTargetAttr target,
+                   TargetOptions targetOptions = {});
+
+  // Init the target.
+  static void init();
+  // Loads the bitcode files to link, adding `libdevice` from the toolkit path.
+  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+  loadBitcodeFiles(llvm::LLVMContext &context, llvm::Module &module) override;
+  // Translates the LLVM module to PTX and links it into a cubin.
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule,
+                 llvm::TargetMachine &targetMachine) override;
+
+private:
+  std::string toolkitPath; // Owned: outlives the by-value `targetOptions`.
+  SmallVector<std::string> fileList; // Bitcode files to link.
+};
+} // namespace
+
+SerializeToCubin::SerializeToCubin(Operation &module, NVPTXTargetAttr target,
+                                   TargetOptions targetOptions)
+    : ModuleToObject(module, target.getTriple(), target.getChip(),
+                     target.getFeatures(), target.getO()),
+      toolkitPath(targetOptions.getToolkitPath()),
+      fileList(targetOptions.getBitcodeFiles()) {
+  if (toolkitPath.empty()) // Fall back to the compiled-in default path.
+    toolkitPath = __DEFAULT_CUDATOOLKIT_PATH__;
+  // Append the string entries of the target's `link` array.
+  if (ArrayAttr linkFiles = target.getLink())
+    for (Attribute entry : linkFiles.getValue())
+      if (auto fileName = dyn_cast<StringAttr>(entry))
+        fileList.push_back(fileName.str());
+}
+
+void SerializeToCubin::init() { static InitTarget once; } // Registers once.
+
+std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
+SerializeToCubin::loadBitcodeFiles(llvm::LLVMContext &context,
+                                   llvm::Module &module) {
+  // Try loading `libdevice` from a CUDA toolkit installation. Without a
+  // toolkit path only the files already in `fileList` are loaded.
+  if (!toolkitPath.empty()) {
+    SmallVector<char, 256> buffer(toolkitPath.begin(), toolkitPath.end());
+    StringRef toolkitDir(buffer.data(), buffer.size());
+    if (!llvm::sys::fs::is_directory(toolkitDir)) {
+      getOperation().emitError() << "CUDA path: " << toolkitDir
+                                 << " does not exist or is not a directory.\n";
+      return std::nullopt;
+    }
+    // TODO remove this hard coded path.
+    llvm::sys::path::append(buffer, "nvvm", "libdevice", "libdevice.10.bc");
+    StringRef libDevice(buffer.data(), buffer.size());
+    if (!llvm::sys::fs::is_regular_file(libDevice)) {
+      getOperation().emitError() << "LibDevice path: " << libDevice
+                                 << " does not exist or is not a file.\n";
+      return std::nullopt;
+    }
+    fileList.push_back(libDevice.str());
+  }
+
+  // Load every queued file; a failure is reported as `std::nullopt`.
+  SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
+  if (failed(loadBitcodeFilesFromList(context, fileList, bcFiles, true)))
+    return std::nullopt;
+  return bcFiles;
+}
+
+// Compiles `llvmModule` to PTX and links it into a cubin with the CUDA driver.
+std::optional<SmallVector<char, 0>>
+SerializeToCubin::moduleToObject(llvm::Module &llvmModule,
+                                 llvm::TargetMachine &targetMachine) {
+  std::optional<std::string> serializedISA =
+      translateToISA(llvmModule, targetMachine);
+  if (!serializedISA) {
+    getOperation().emitError() << "Failed translating the module to ISA.";
+    return std::nullopt;
+  }
+  // `cast` asserts the GPU module invariant instead of using a null op.
+  auto gpuModule = cast<GPUModuleOp>(getOperation());
+  LLVM_DEBUG({
+    llvm::dbgs() << "ISA for module: " << gpuModule.getNameAttr() << "\n";
+    llvm::dbgs() << *serializedISA << "\n";
+    llvm::dbgs().flush();
+  });
+  using Cubin = std::optional<SmallVector<char, 0>>;
+  auto loc = getOperation().getLoc();
+  char jitErrorBuffer[4096] = {0};
+  RETURN_ON_CUDA_ERROR(cuInit(0));
+
+  // Linking requires a device context.
+  CUdevice device;
+  RETURN_ON_CUDA_ERROR(cuDeviceGet(&device, 0));
+  CUcontext context;
+  RETURN_ON_CUDA_ERROR(cuCtxCreate(&context, 0, device));
+  // The steps below are lambdas so that an early return on error unwinds to
+  // the caller, which still destroys the handle it created.
+  auto linkISA = [&](CUlinkState linkState) -> Cubin {
+    std::string kernelName = gpuModule.getName().str();
+    RETURN_ON_CUDA_ERROR(cuLinkAddData(
+        linkState, CUjitInputType::CU_JIT_INPUT_PTX,
+        const_cast<void *>(static_cast<const void *>(serializedISA->c_str())),
+        serializedISA->length(), kernelName.c_str(),
+        0,       /* number of jit options */
+        nullptr, /* jit options */
+        nullptr  /* jit option values */
+        ));
+    void *cubinData;
+    size_t cubinSize;
+    RETURN_ON_CUDA_ERROR(cuLinkComplete(linkState, &cubinData, &cubinSize));
+    char *cubinAsChar = static_cast<char *>(cubinData);
+    return SmallVector<char, 0>(cubinAsChar, cubinAsChar + cubinSize);
+  };
+  auto createLinkerAndLink = [&]() -> Cubin {
+    CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+                                 CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
+    void *jitOptionsVals[] = {jitErrorBuffer,
+                              reinterpret_cast<void *>(sizeof(jitErrorBuffer))};
+    CUlinkState linkState;
+    RETURN_ON_CUDA_ERROR(
+        cuLinkCreate(2, jitOptions, jitOptionsVals, &linkState));
+    Cubin cubin = linkISA(linkState);
+    // This will also destroy the cubin data.
+    RETURN_ON_CUDA_ERROR(cuLinkDestroy(linkState));
+    return cubin;
+  };
+  Cubin result = createLinkerAndLink();
+  RETURN_ON_CUDA_ERROR(cuCtxDestroy(context));
+  return result;
+}
+
+std::optional<SmallVector<char, 0>>
+NVPTXTargetAttr::serializeToObject(Operation *module,
+                                   const TargetOptions &options) const {
+  assert(module && "The module must be non null.");
+  if (!module)
+    return std::nullopt;
+  if (!mlir::isa<GPUModuleOp>(module)) {
+    module->emitError("Module must be a GPU module.");
+    return std::nullopt;
+  }
+  // Initialize the target before running the serializer.
+  SerializeToCubin::init();
+  return SerializeToCubin(*module, *this, options).run();
+}
+
+#else
+// Target disabled: return an empty object (not nullopt) for testing purposes.
+std::optional<SmallVector<char, 0>>
+NVPTXTargetAttr::serializeToObject(Operation *module,
+                                   const TargetOptions &options) const {
+  assert(module && "The module must be non null.");
+  if (!module)
+    return std::nullopt;
+  if (!mlir::isa<GPUModuleOp>(module)) {
+    module->emitError("Module must be a GPU module.");
+    return std::nullopt;
+  }
+  return SmallVector<char, 0>{};
+}
+#endif // MLIR_GPU_NVPTX_TARGET_ENABLED
+
+LogicalResult
+NVPTXTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                        int optLevel, StringRef triple, StringRef chip,
+                        StringRef features, DictionaryAttr flags,
+                        ArrayAttr files) {
+  if (optLevel < 0 || optLevel > 3) {
+    emitError() << "The optimization level must be a number between 0 and 3.";
+    return failure();
+  }
+  if (triple.empty()) {
+    emitError() << "The target triple cannot be empty.";
+    return failure();
+  }
+  if (chip.empty()) {
+    emitError() << "The target chip cannot be empty.";
+    return failure();
+  }
+  // Reject `link` arrays holding anything other than string attributes.
+  auto isString = [](Attribute a) { return a && mlir::isa<StringAttr>(a); };
+  if (files && !llvm::all_of(files, isString)) {
+    emitError() << "All the elements in the `link` array must be strings.";
+    return failure();
+  }
+  return success();
+}