diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -54,14 +54,23 @@
 protected:
   void getDependentDialects(DialectRegistry &registry) const override;
 
-private:
-  /// Creates the LLVM target machine to generate the ISA.
-  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+  /// Translates the module to ISA
+  virtual Optional<std::string>
+  translateToISA(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine);
+
+  /// Hook allowing the application of optimizations before codegen
+  /// By default, does nothing
+  virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                                     llvm::TargetMachine &targetMachine);
 
   /// Translates the 'getOperation()' result to an LLVM module.
   virtual std::unique_ptr<llvm::Module>
   translateToLLVMIR(llvm::LLVMContext &llvmContext);
 
+private:
+  /// Creates the LLVM target machine to generate the ISA.
+  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+
   /// Serializes the target ISA to binary form.
   virtual std::unique_ptr<std::vector<char>>
   serializeISA(const std::string &isa) = 0;
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -8,11 +8,14 @@
 
 if (MLIR_ENABLE_ROCM_CONVERSIONS)
   set(AMDGPU_LIBS
+    IRReader
+    linker
     MCParser
     AMDGPUAsmParser
     AMDGPUCodeGen
     AMDGPUDesc
     AMDGPUInfo
+    target
   )
 endif()
 
@@ -127,40 +130,22 @@
     message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS")
   endif()
 
-  # Configure ROCm support.
-  if (NOT DEFINED ROCM_PATH)
-    if (NOT DEFINED ENV{ROCM_PATH})
-      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
-    else()
-      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed")
-    endif()
-    set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH " Path to which HIP has been installed")
-  endif()
-  set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
-  find_package(HIP)
-  if (NOT HIP_FOUND)
-    message(SEND_ERROR "Building mlir with ROCm support requires a working ROCm and HIP install")
-  else()
-    message(STATUS "ROCm HIP version: ${HIP_VERSION}")
-  endif()
-
+  set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
   target_compile_definitions(obj.MLIRGPUOps
     PRIVATE
-    __HIP_PLATFORM_HCC__
-    __ROCM_PATH__="${ROCM_PATH}"
+    __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}"
     MLIR_GPU_TO_HSACO_PASS_ENABLE=1
   )
 
   target_include_directories(obj.MLIRGPUOps
     PRIVATE
     ${MLIR_SOURCE_DIR}/../lld/include
-    ${HIP_PATH}/include
-    ${ROCM_PATH}/include
   )
 
   target_link_libraries(MLIRGPUOps
     PRIVATE
     lldELF
+    MLIRExecutionEngine
     MLIRROCDLToLLVMIRTranslation
   )
 
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@@ -31,18 +31,28 @@
 gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
     : OperationPass<gpu::GPUModuleOp>(other) {}
 
-static std::string translateToISA(llvm::Module &llvmModule,
-                                  llvm::TargetMachine &targetMachine) {
+Optional<std::string>
+gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule,
+                                         llvm::TargetMachine &targetMachine) {
   llvmModule.setDataLayout(targetMachine.createDataLayout());
 
+  if (failed(optimizeLlvm(llvmModule, targetMachine))) {
+    return llvm::None;
+  }
   std::string targetISA;
   llvm::raw_string_ostream stream(targetISA);
-  llvm::buffer_ostream pstream(stream);
+
   llvm::legacy::PassManager codegenPasses;
-  targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                    llvm::CGFT_AssemblyFile);
-  codegenPasses.run(llvmModule);
-  return targetISA;
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CGFT_AssemblyFile)) {
+      return llvm::None;
+    }
+    codegenPasses.run(llvmModule);
+  }
+  return stream.str();
 }
 
 void gpu::SerializeToBlobPass::runOnOperation() {
@@ -58,7 +68,13 @@
   if (!targetMachine)
     return signalPassFailure();
 
-  std::string targetISA = translateToISA(*llvmModule, *targetMachine);
+  Optional<std::string> maybeTargetISA =
+      translateToISA(*llvmModule, *targetMachine);
+
+  if (!maybeTargetISA.hasValue()) {
+    return signalPassFailure();
+  }
+  std::string targetISA = maybeTargetISA.getValue();
 
   // Serialize the target ISA.
   std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
@@ -71,6 +87,14 @@
   getOperation()->setAttr(gpuBinaryAnnotation, attr);
 }
 
+LogicalResult
+gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  // Default hook: no optimizations. TODO: If serializeToCubin ends up defining
+  // optimizations, factor them into here from SerializeToHsaco
+  return success();
+}
+
 void gpu::SerializeToBlobPass::getDependentDialects(
     DialectRegistry &registry) const {
   registerLLVMDialectTranslation(registry);
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -13,11 +13,17 @@
 #include "mlir/Dialect/GPU/Passes.h"
 
 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
+#include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -27,19 +33,19 @@
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-
 #include "llvm/MC/TargetRegistry.h"
+
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileUtilities.h"
-#include "llvm/Support/LineIterator.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/WithColor.h"
+
 #include "llvm/Target/TargetOptions.h"
 
 #include "lld/Common/Driver.h"
 
-#include "hip/hip_version.h"
-
 #include <mutex>
 
 using namespace mlir;
@@ -48,13 +54,36 @@
 class SerializeToHsacoPass
     : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
 public:
+  // Needed to make options work
   SerializeToHsacoPass();
+  SerializeToHsacoPass(const SerializeToHsacoPass &other) {
+    if (other.triple.hasValue()) {
+      this->triple = other.triple;
+    }
+    if (other.chip.hasValue()) {
+      this->chip = other.chip;
+    }
+    if (other.features.hasValue()) {
+      this->features = other.features;
+    }
+    if (other.rocmPath.hasValue()) {
+      this->rocmPath = other.rocmPath;
+    }
+    this->optLevel = other.optLevel;
+  };
+
+  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
+                       int optLevel);
 
   StringRef getArgument() const override { return "gpu-to-hsaco"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to HSACO binary annotations";
   }
 
+protected:
+  Option<std::string> rocmPath{*this, "rocm-path",
+                               llvm::cl::desc("Path to ROCm install")};
+
 private:
   void getDependentDialects(DialectRegistry &registry) const override;
 
@@ -62,67 +91,35 @@
   std::unique_ptr<std::vector<char>>
   serializeISA(const std::string &isa) override;
 
+  // Override to allow linking in device libs
+  std::unique_ptr<llvm::Module>
+  translateToLLVMIR(llvm::LLVMContext &llvmContext) override;
+
+  /// Adds LLVM optimization passes
+  LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                             llvm::TargetMachine &targetMachine) override;
+
   std::unique_ptr<SmallVectorImpl<char>> assembleIsa(const std::string &isa);
   std::unique_ptr<std::vector<char>>
   createHsaco(const SmallVectorImpl<char> &isaBinary);
-};
-} // namespace
-
-static std::string getDefaultChip() {
-  const char kDefaultChip[] = "gfx900";
 
-  // Locate rocm_agent_enumerator.
-  const char kRocmAgentEnumerator[] = "rocm_agent_enumerator";
-  llvm::ErrorOr<std::string> rocmAgentEnumerator = llvm::sys::findProgramByName(
-      kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"});
-  if (!rocmAgentEnumerator) {
-    llvm::WithColor::warning(llvm::errs())
-        << kRocmAgentEnumerator << "couldn't be located under " << __ROCM_PATH__
-        << "/bin\n";
-    return kDefaultChip;
-  }
+  std::string getRocmPath();
 
-  // Prepare temp file to hold the outputs.
-  int tempFd = -1;
-  SmallString<128> tempFilename;
-  if (llvm::sys::fs::createTemporaryFile("rocm_agent", "txt", tempFd,
-                                         tempFilename)) {
-    llvm::WithColor::warning(llvm::errs())
-        << "temporary file for " << kRocmAgentEnumerator << " creation error\n";
-    return kDefaultChip;
-  }
-  llvm::FileRemover cleanup(tempFilename);
-
-  // Invoke rocm_agent_enumerator.
-  std::string errorMessage;
-  SmallVector<StringRef, 2> args{"-t", "GPU"};
-  Optional<StringRef> redirects[3] = {{""}, tempFilename.str(), {""}};
-  int result =
-      llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None,
-                                redirects, 0, 0, &errorMessage);
-  if (result) {
-    llvm::WithColor::warning(llvm::errs())
-        << kRocmAgentEnumerator << " invocation error: " << errorMessage
-        << "\n";
-    return kDefaultChip;
-  }
-
-  // Load and parse the result.
-  auto gfxIsaList = openInputFile(tempFilename);
-  if (!gfxIsaList) {
-    llvm::WithColor::error(llvm::errs())
-        << "read ROCm agent list temp file error\n";
-    return kDefaultChip;
+  int optLevel;
+};
+} // end namespace
+
+/// Get a user-specified path to ROCm.
+/// Tries, in order, the rocm-path pass option, the ROCM_PATH environment
+/// variable, and a compile-time default.
+std::string SerializeToHsacoPass::getRocmPath() {
+  if (rocmPath.getNumOccurrences() > 0) {
+    return rocmPath.getValue();
   }
-  for (llvm::line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) {
-    // Skip the line with content "gfx000".
-    if (*lines == "gfx000")
-      continue;
-    // Use the first ISA version found.
-    return lines->str();
+  if (auto env = llvm::sys::Process::GetEnv("ROCM_PATH")) {
+    return env.getValue();
   }
-
-  return kDefaultChip;
+  return __DEFAULT_ROCM_PATH__;
 }
 
 // Sets the 'option' to 'value' unless it already has a value.
@@ -132,12 +129,12 @@
     option = getValue();
 }
 
-SerializeToHsacoPass::SerializeToHsacoPass() {
-  maybeSetOption(this->triple, [] { return "amdgcn-amd-amdhsa"; });
-  maybeSetOption(this->chip, [] {
-    static auto chip = getDefaultChip();
-    return chip;
-  });
+SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
+                                           StringRef features, int optLevel)
+    : optLevel(optLevel) {
+  maybeSetOption(this->triple, [&triple] { return triple.str(); });
+  maybeSetOption(this->chip, [&arch] { return arch.str(); });
+  maybeSetOption(this->features, [&features] { return features.str(); });
 }
 
 void SerializeToHsacoPass::getDependentDialects(
@@ -146,6 +143,208 @@
   gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
+/// Lazily loads every bitcode file named in `libraries` from directory `path`.
+/// Returns llvm::None if `path` is not a directory or any file fails to load.
+static Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>>
+loadLibraries(SmallVectorImpl<char> &path,
+              SmallVectorImpl<StringRef> &libraries,
+              llvm::LLVMContext &context) {
+  SmallVector<std::unique_ptr<llvm::Module>, 3> ret;
+  auto dirLength = path.size();
+  if (!llvm::sys::fs::is_directory(path)) {
+    llvm::dbgs() << "Bitcode path: " << path
+                 << " does not exist or is not a directory\n";
+    return llvm::None;
+  }
+
+  for (const auto &file : libraries) {
+    llvm::SMDiagnostic error;
+    llvm::sys::path::append(path, file);
+    llvm::StringRef pathRef(path.data(), path.size());
+    std::unique_ptr<llvm::Module> library =
+        llvm::getLazyIRFileModule(pathRef, error, context);
+    path.set_size(dirLength); // Restore `path` to the bare directory
+    if (!library) {
+      llvm::dbgs() << "Failed to load library " << file << " from " << path
+                   << "\n";
+      error.print("[MLIR backend]", llvm::dbgs());
+      return llvm::None;
+    }
+    // Some ROCM builds don't strip this like they should
+    if (auto *openclVersion = library->getNamedMetadata("opencl.ocl.version"))
+      library->eraseNamedMetadata(openclVersion);
+    // Stop spamming us with clang version numbers
+    if (auto *ident = library->getNamedMetadata("llvm.ident"))
+      library->eraseNamedMetadata(ident);
+    ret.push_back(std::move(library));
+  }
+
+  return ret;
+}
+
+/// Translates the GPU module to LLVM IR, then links in whichever ROCm device
+/// bitcode libraries (opencl, ocml, ockl) its undefined external functions
+/// call for. Returns nullptr if translation, chip parsing, or linking fails.
+std::unique_ptr<llvm::Module>
+SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) {
+  // MLIR -> LLVM translation
+  std::unique_ptr<llvm::Module> ret =
+      gpu::SerializeToBlobPass::translateToLLVMIR(llvmContext);
+
+  if (!ret) {
+    llvm::dbgs() << "Module creation failed\n";
+    return ret;
+  }
+  // Walk the LLVM module in order to determine if we need to link in device
+  // libs
+  bool needOpenCl = false;
+  bool needOckl = false;
+  bool needOcml = false;
+  for (auto &f : ret->functions()) {
+    if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
+      StringRef funcName = f.getName();
+      if ("printf" == funcName)
+        needOpenCl = true;
+      if (funcName.startswith("__ockl_"))
+        needOckl = true;
+      if (funcName.startswith("__ocml_"))
+        needOcml = true;
+    }
+  }
+
+  if (needOpenCl)
+    needOcml = needOckl = true;
+
+  // No libraries needed (the typical case)
+  if (!(needOpenCl || needOcml || needOckl)) {
+    return ret;
+  }
+
+  auto addControlConstant = [&module = *ret](StringRef name, uint32_t value,
+                                             uint32_t bitwidth) {
+    using llvm::GlobalVariable;
+    if (module.getNamedGlobal(name)) {
+      return;
+    }
+    llvm::IntegerType *type =
+        llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
+    auto *initializer = llvm::ConstantInt::get(type, value, /*isSigned=*/false);
+    auto *constant = new GlobalVariable(
+        module, type,
+        /*isConstant=*/true, GlobalVariable::LinkageTypes::LinkOnceODRLinkage,
+        initializer, name,
+        /*before=*/nullptr,
+        /*threadLocalMode=*/GlobalVariable::ThreadLocalMode::NotThreadLocal,
+        /*addressSpace=*/4);
+    constant->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local);
+    constant->setVisibility(
+        GlobalVariable::VisibilityTypes::ProtectedVisibility);
+    constant->setAlignment(llvm::MaybeAlign(bitwidth / 8));
+  };
+
+  // Set up control variables in the module instead of linking in tiny bitcode
+  if (needOcml) {
+    // TODO(kdrewnia): Enable math optimizations once we have support for
+    // `-ffast-math`-like options
+    addControlConstant("__oclc_finite_only_opt", 0, 8);
+    addControlConstant("__oclc_daz_opt", 0, 8);
+    addControlConstant("__oclc_correctly_rounded_sqrt32", 1, 8);
+    addControlConstant("__oclc_unsafe_math_opt", 0, 8);
+  }
+  if (needOcml || needOckl) {
+    addControlConstant("__oclc_wavefrontsize64", 1, 8);
+    StringRef chipSet = this->chip.getValue();
+    chipSet.consume_front("gfx");
+    uint32_t major = 0, minor = 0;
+    // Validate "<decimal major><2 hex digits>" up front: an empty or short
+    // chip would otherwise underflow size() - 2 and trip APInt's asserts.
+    if (chipSet.size() < 3 || chipSet.take_back(2).getAsInteger(16, minor) ||
+        chipSet.drop_back(2).getAsInteger(10, major)) {
+      llvm::errs() << "Error: malformed chip name '" << chipSet << "'\n";
+      return nullptr;
+    }
+    addControlConstant("__oclc_ISA_version", minor + 1000 * major, 32);
+  }
+
+  // Determine libraries we need to link
+  llvm::SmallVector<StringRef, 4> libraries;
+  if (needOpenCl) {
+    libraries.push_back("opencl.bc");
+  }
+  if (needOcml) {
+    libraries.push_back("ocml.bc");
+  }
+  if (needOckl) {
+    libraries.push_back("ockl.bc");
+  }
+
+  Optional<SmallVector<std::unique_ptr<llvm::Module>, 3>> mbModules;
+  auto theRocmPath = getRocmPath();
+  llvm::SmallString<32> bitcodePath(theRocmPath);
+  llvm::sys::path::append(bitcodePath, "amdgcn", "bitcode");
+  mbModules = loadLibraries(bitcodePath, libraries, llvmContext);
+
+  // Handle legacy override variable
+  auto env = llvm::sys::Process::GetEnv("HIP_DEVICE_LIB_PATH");
+  if (env && (rocmPath.getNumOccurrences() == 0)) {
+    llvm::SmallString<32> overrideValue(env.getValue());
+    auto mbAtOldPath = loadLibraries(overrideValue, libraries, llvmContext);
+    if (mbAtOldPath) {
+      mbModules = std::move(mbAtOldPath);
+    }
+  }
+
+  if (!mbModules) {
+    llvm::WithColor::warning(llvm::errs())
+        << "Warning: Could not load required device libraries\n";
+    llvm::WithColor::note(llvm::errs())
+        << "Note: this will probably cause link-time or run-time failures\n";
+    return ret; // We can still abort here
+  }
+
+  llvm::Linker linker(*ret);
+  for (auto &libModule : mbModules.getValue()) {
+    // Failure is true
+    auto err = linker.linkInModule(
+        std::move(libModule), llvm::Linker::Flags::LinkOnlyNeeded,
+        [](llvm::Module &m, const StringSet<> &gvs) {
+          llvm::internalizeModule(m, [&gvs](const llvm::GlobalValue &gv) {
+            return !gv.hasName() || (gvs.count(gv.getName()) == 0);
+          });
+        });
+    if (err) {
+      llvm::errs() << "Error: Failure in library bitcode linking\n";
+      // We have no guarantees about the state of `ret`, so bail
+      return nullptr;
+    }
+  }
+  return ret;
+}
+
+LogicalResult
+SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
+                                   llvm::TargetMachine &targetMachine) {
+  // Reject anything outside the 0..3 range before casting to a codegen level
+  if (!(0 <= optLevel && optLevel <= 3)) {
+    llvm::errs() << "Invalid optimization level passed to SerializeToHsaco: "
+                 << optLevel << "\n";
+    return failure();
+  }
+  targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
+  llvm::Error optError = makeOptimizingTransformer(
+      optLevel, /*sizeLevel=*/0, &targetMachine)(&llvmModule);
+  if (!optError)
+    return success();
+  // Consume the error while logging it so the pass can fail gracefully
+  llvm::handleAllErrors(
+      std::move(optError), [](const llvm::ErrorInfoBase &info) {
+        llvm::errs() << "Could not optimize LLVM IR: ";
+        info.log(llvm::errs());
+        llvm::errs() << "\n";
+      });
+  return failure();
+}
+
 std::unique_ptr<SmallVectorImpl<char>>
 SerializeToHsacoPass::assembleIsa(const std::string &isa) {
   auto loc = getOperation().getLoc();
@@ -281,7 +480,8 @@
         LLVMInitializeAMDGPUTargetInfo();
         LLVMInitializeAMDGPUTargetMC();
 
-        return std::make_unique<SerializeToHsacoPass>();
+        return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
+                                                      "", 2);
       });
 }
 #else  // MLIR_GPU_TO_HSACO_PASS_ENABLE
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
--- a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
+++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN:   -gpu-kernel-outlining \
-// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco{chip=%chip})' \
 // RUN:   -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
diff --git a/mlir/test/Integration/GPU/ROCM/lit.local.cfg b/mlir/test/Integration/GPU/ROCM/lit.local.cfg
--- a/mlir/test/Integration/GPU/ROCM/lit.local.cfg
+++ b/mlir/test/Integration/GPU/ROCM/lit.local.cfg
@@ -1,2 +1,23 @@
+import subprocess
+
 if not config.enable_rocm_runner:
   config.unsupported = True
+
+# Need to specify the chip sub-option to the gpu-to-hsaco pass, and also check
+# that we have a gpu at all.  Use rocm_agent_enumerator to find the chip and
+# use %chip to insert it, and if no chip is found, mark the test unsupported.
+# Even though the tests here use mlir-cpu-runner, they still call mgpu
+# functions.
+config.chip = 'gfx000'
+if config.rocm_path:
+   try:
+       p = subprocess.run([config.rocm_path + "/bin/rocm_agent_enumerator"],
+                          check=True, stdout=subprocess.PIPE)
+       agents = [x for x in p.stdout.split() if x != b'gfx000']
+       if agents:
+           config.chip = agents[0].decode('utf-8')
+       else:
+           config.unsupported = True
+   except (subprocess.CalledProcessError, OSError):  # OSError: tool missing
+       config.unsupported = True
+config.substitutions.append(('%chip', config.chip))
diff --git a/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
--- a/mlir/test/Integration/GPU/ROCM/two-modules.mlir
+++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt %s \
 // RUN:   -gpu-kernel-outlining \
-// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco{chip=%chip})' \
 // RUN:   -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
diff --git a/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
--- a/mlir/test/Integration/GPU/ROCM/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN:   -convert-scf-to-std \
 // RUN:   -gpu-kernel-outlining \
-// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco{chip=%chip})' \
 // RUN:   -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
diff --git a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
--- a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -1,7 +1,7 @@
 // RUN: mlir-opt %s \
 // RUN:   -convert-scf-to-std \
 // RUN:   -gpu-kernel-outlining \
-// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco{chip=%chip})' \
 // RUN:   -gpu-to-llvm \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -41,6 +41,7 @@
 config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
+config.rocm_path = "@ROCM_PATH@"
 config.spirv_wrapper_library_dir = "@MLIR_SPIRV_WRAPPER_LIBRARY_DIR@"
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.vulkan_wrapper_library_dir = "@MLIR_VULKAN_WRAPPER_LIBRARY_DIR@"