diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -54,14 +54,23 @@
 protected:
   void getDependentDialects(DialectRegistry &registry) const override;
 
-private:
-  /// Creates the LLVM target machine to generate the ISA.
-  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+  /// Hook allowing the application of optimizations before codegen.
+  /// By default, does nothing.
+  virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                                     llvm::TargetMachine &targetMachine);
 
   /// Translates the 'getOperation()' result to an LLVM module.
   virtual std::unique_ptr<llvm::Module>
   translateToLLVMIR(llvm::LLVMContext &llvmContext);
 
+private:
+  /// Creates the LLVM target machine to generate the ISA.
+  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+
+  /// Translates the module to ISA.
+  Optional<std::string> translateToISA(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine);
+
   /// Serializes the target ISA to binary form.
   virtual std::unique_ptr<std::vector<char>>
   serializeISA(const std::string &isa) = 0;
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -145,14 +145,14 @@
     message(STATUS "ROCm HIP version: ${HIP_VERSION}")
   endif()
 
-  target_compile_definitions(obj.MLIRGPUOps
+  target_compile_definitions(obj.MLIRGPUTransforms
     PRIVATE
     __HIP_PLATFORM_HCC__
     __ROCM_PATH__="${ROCM_PATH}"
     MLIR_GPU_TO_HSACO_PASS_ENABLE=1
   )
 
-  target_include_directories(obj.MLIRGPUOps
+  target_include_directories(obj.MLIRGPUTransforms
     PRIVATE
     ${MLIR_SOURCE_DIR}/../lld/include
     ${HIP_PATH}/include
@@ -162,6 +162,7 @@
   target_link_libraries(MLIRGPUOps
     PRIVATE
     lldELF
+    MLIRExecutionEngine
     MLIRROCDLToLLVMIRTranslation
   )
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@@ -31,18 +31,28 @@
 gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
     : OperationPass<gpu::GPUModuleOp>(other) {}
 
-static std::string translateToISA(llvm::Module &llvmModule,
-                                  llvm::TargetMachine &targetMachine) {
+Optional<std::string>
+gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule,
+                                         llvm::TargetMachine &targetMachine) {
   llvmModule.setDataLayout(targetMachine.createDataLayout());
 
+  if (failed(optimizeLlvm(llvmModule, targetMachine)))
+    return llvm::None;
+
   std::string targetISA;
   llvm::raw_string_ostream stream(targetISA);
-  llvm::buffer_ostream pstream(stream);
+
   llvm::legacy::PassManager codegenPasses;
-  targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                    llvm::CGFT_AssemblyFile);
-  codegenPasses.run(llvmModule);
-  return targetISA;
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CGFT_AssemblyFile))
+      return llvm::None;
+
+    codegenPasses.run(llvmModule);
+  }
+  return stream.str();
 }
 
 void gpu::SerializeToBlobPass::runOnOperation() {
@@ -58,7 +68,13 @@
   if (!targetMachine)
     return signalPassFailure();
 
-  std::string targetISA = translateToISA(*llvmModule, *targetMachine);
+  Optional<std::string> maybeTargetISA =
+      translateToISA(*llvmModule, *targetMachine);
+
+  if (!maybeTargetISA.hasValue())
+    return signalPassFailure();
+
+  std::string targetISA = std::move(maybeTargetISA.getValue());
 
   // Serialize the target ISA.
   std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
@@ -71,6 +87,14 @@
   getOperation()->setAttr(gpuBinaryAnnotation, attr);
 }
 
+LogicalResult
+gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  // TODO: If serializeToCubin ends up defining optimizations, factor them
+  // into here from SerializeToHsaco
+  return success();
+}
+
 void gpu::SerializeToBlobPass::getDependentDialects(
     DialectRegistry &registry) const {
  registerLLVMDialectTranslation(registry);
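Note on the translateToISA change above: llvm::buffer_ostream forwards its contents to the wrapped stream only from its destructor, so the ISA string must not be consumed while the buffering stream is still alive. A minimal standalone sketch of the lifetime pattern the new scoping enforces (illustrative only, not part of the patch):

    #include "llvm/Support/raw_ostream.h"

    #include <string>

    // Sketch: buffer_ostream flushes into the wrapped stream only when it is
    // destroyed, so end its lifetime before reading the backing string.
    std::string emitThroughBuffer() {
      std::string out;
      llvm::raw_string_ostream stream(out);
      {
        llvm::buffer_ostream pstream(stream); // buffers all writes
        pstream << "...ISA text...";
      } // ~buffer_ostream() drains the buffer into `stream`
      return stream.str(); // safe: the buffer has been flushed
    }

    int main() {
      llvm::outs() << emitThroughBuffer() << "\n";
      return 0;
    }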
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -11,8 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 
 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
+#include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
@@ -32,8 +35,11 @@
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/WithColor.h"
+
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
 #include "lld/Common/Driver.h"
@@ -48,12 +54,24 @@
 class SerializeToHsacoPass
     : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
 public:
-  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features);
+  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
+                       int optLevel);
+  SerializeToHsacoPass(const SerializeToHsacoPass &other);
   StringRef getArgument() const override { return "gpu-to-hsaco"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to HSACO binary annotations";
   }
 
+protected:
+  Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for HSACO compilation"),
+      llvm::cl::init(2)};
+
+  /// Adds LLVM optimization passes.
+  LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                             llvm::TargetMachine &targetMachine) override;
+
 private:
   void getDependentDialects(DialectRegistry &registry) const override;
 
@@ -67,6 +85,8 @@
 };
 } // namespace
 
+SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
+    : PassWrapper(other) {}
+
 static std::string getDefaultChip() {
   const char kDefaultChip[] = "gfx900";
@@ -132,10 +152,12 @@
 }
 
 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
-                                           StringRef features) {
+                                           StringRef features, int optLevel) {
   maybeSetOption(this->triple, [&triple] { return triple.str(); });
   maybeSetOption(this->chip, [&arch] { return arch.str(); });
   maybeSetOption(this->features, [&features] { return features.str(); });
+  if (this->optLevel.getNumOccurrences() == 0)
+    this->optLevel.setValue(optLevel);
 }
 
 void SerializeToHsacoPass::getDependentDialects(
@@ -144,6 +166,30 @@
   gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
+LogicalResult
+SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
+                                   llvm::TargetMachine &targetMachine) {
+  int optLevel = this->optLevel.getValue();
+  if (optLevel < 0 || optLevel > 3)
+    return getOperation().emitError()
+           << "Invalid HSA optimization level " << optLevel << "\n";
+
+  targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
+
+  auto transformer =
+      makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine);
+  auto error = transformer(&llvmModule);
+  if (error) {
+    InFlightDiagnostic mlirError = getOperation()->emitError();
+    llvm::handleAllErrors(
+        std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) {
+          mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n";
+        });
+    return mlirError;
+  }
+  return success();
+}
+
 std::unique_ptr<std::vector<char>>
 SerializeToHsacoPass::assembleIsa(const std::string &isa) {
   auto loc = getOperation().getLoc();
@@ -170,8 +216,11 @@
   std::unique_ptr<llvm::MCAsmInfo> mai(
       target->createMCAsmInfo(*mri, this->triple, mcOptions));
   mai->setRelaxELFRelocations(true);
+  std::unique_ptr<llvm::MCSubtargetInfo> sti(
+      target->createMCSubtargetInfo(this->triple, this->chip, this->features));
 
-  llvm::MCContext ctx(triple, mai.get(), mri.get(), &srcMgr, &mcOptions);
+  llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
+                      &mcOptions);
   std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
       ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
   ctx.setObjectFileInfo(mofi.get());
@@ -182,8 +231,6 @@
   std::unique_ptr<llvm::MCStreamer> mcStreamer;
   std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
 
-  std::unique_ptr<llvm::MCSubtargetInfo> sti(
-      target->createMCSubtargetInfo(this->triple, this->chip, this->features));
   llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, *mri, ctx);
   llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
 
@@ -280,7 +327,7 @@
     LLVMInitializeAMDGPUTargetMC();
 
     return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
-                                                  "");
+                                                  "", 2);
   });
 }
 
 #else // MLIR_GPU_TO_HSACO_PASS_ENABLE
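The optimizeLlvm override above delegates to makeOptimizingTransformer from mlir/ExecutionEngine/OptUtils.h, which assembles the standard LLVM -O<N> pass pipeline; this is also why the GPU library now links against MLIRExecutionEngine. A minimal standalone sketch of the same mechanism, with diagnostics reduced to a bool for brevity (illustrative only, not part of the patch):

    #include "mlir/ExecutionEngine/OptUtils.h"

    #include "llvm/IR/Module.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Target/TargetMachine.h"

    // Sketch: run the standard -O<optLevel> pipeline that
    // makeOptimizingTransformer builds; returns false on failure.
    bool runStandardOptPipeline(llvm::Module &module,
                                llvm::TargetMachine &targetMachine,
                                int optLevel) {
      auto transformer = mlir::makeOptimizingTransformer(
          optLevel, /*sizeLevel=*/0, &targetMachine);
      if (llvm::Error error = transformer(&module)) {
        // Real code should surface the message, as optimizeLlvm does above.
        llvm::consumeError(std::move(error));
        return false;
      }
      return true;
    }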
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -202,8 +202,11 @@
       ${HIP_PATH}/include
       ${ROCM_PATH}/include
     )
+    set_property(TARGET mlir_rocm_runtime
+      PROPERTY INSTALL_RPATH_USE_LINK_PATH ON)
+
     target_link_libraries(mlir_rocm_runtime
-      PRIVATE
+      PUBLIC
       ${ROCM_RUNTIME_LIBRARY}
     )
   endif()
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
--- a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
+++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -11,10 +11,10 @@
 func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %block_dim = dim %arg1, %c0 : memref<?xf32>
+  %block_dim = memref.dim %arg1, %c0 : memref<?xf32>
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
-    store %arg0, %arg1[%tx] : memref<?xf32>
+    memref.store %arg0, %arg1[%tx] : memref<?xf32>
     gpu.terminator
   }
   return
@@ -22,12 +22,12 @@
 
 // CHECK: [1, 1, 1, 1, 1]
 func @main() {
-  %arg0 = alloc() : memref<5xf32>
+  %arg0 = memref.alloc() : memref<5xf32>
   %21 = arith.constant 5 : i32
-  %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
-  %cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  %22 = memref.cast %arg0 : memref<5xf32> to memref<?xf32>
+  %cast = memref.cast %22 : memref<?xf32> to memref<*xf32>
   gpu.host_register %cast : memref<*xf32>
-  %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  %23 = memref.cast %22 : memref<?xf32> to memref<*xf32>
   call @print_memref_f32(%23) : (memref<*xf32>) -> ()
   %24 = arith.constant 1.0 : f32
   %25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
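For completeness, the registration shown earlier keeps the previous behavior by defaulting to optimization level 2, while the new opt-level pass option allows overriding it. A hypothetical sketch of scheduling the pass programmatically at a different level; the factory name createGpuSerializeToHsacoPass and its extended signature are assumptions here, mirroring the four-argument constructor above rather than code from this patch:

    #include "mlir/Dialect/GPU/GPUDialect.h"
    #include "mlir/Dialect/GPU/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical usage sketch (not part of this patch): serialize GPU
    // modules to HSACO at -O3. Assumes createGpuSerializeToHsacoPass was
    // extended to forward the new optLevel parameter.
    void buildHsacoPipeline(mlir::PassManager &pm) {
      pm.addNestedPass<mlir::gpu::GPUModuleOp>(
          mlir::createGpuSerializeToHsacoPass("amdgcn-amd-amdhsa", "gfx900",
                                              /*features=*/"", /*optLevel=*/3));
    }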
diff --git a/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
--- a/mlir/test/Integration/GPU/ROCM/two-modules.mlir
+++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -10,24 +10,24 @@
 // CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
 func @main() {
-  %arg = alloc() : memref<13xi32>
-  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+  %arg = memref.alloc() : memref<13xi32>
+  %dst = memref.cast %arg : memref<13xi32> to memref<?xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %sx = dim %dst, %c0 : memref<?xi32>
-  %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
+  %sx = memref.dim %dst, %c0 : memref<?xi32>
+  %cast_dst = memref.cast %dst : memref<?xi32> to memref<*xi32>
   gpu.host_register %cast_dst : memref<*xi32>
   %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %c1, %block_z = %c1) {
     %t0 = arith.index_cast %tx : index to i32
-    store %t0, %dst_device[%tx] : memref<?xi32>
+    memref.store %t0, %dst_device[%tx] : memref<?xi32>
     gpu.terminator
   }
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %c1, %block_z = %c1) {
     %t0 = arith.index_cast %tx : index to i32
-    store %t0, %dst_device[%tx] : memref<?xi32>
+    memref.store %t0, %dst_device[%tx] : memref<?xi32>
     gpu.terminator
   }
   call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
diff --git a/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
--- a/mlir/test/Integration/GPU/ROCM/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -12,13 +12,13 @@
 func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %block_dim = dim %arg0, %c0 : memref<?xf32>
+  %block_dim = memref.dim %arg0, %c0 : memref<?xf32>
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
-    %a = load %arg0[%tx] : memref<?xf32>
-    %b = load %arg1[%tx] : memref<?xf32>
+    %a = memref.load %arg0[%tx] : memref<?xf32>
+    %b = memref.load %arg1[%tx] : memref<?xf32>
     %c = arith.addf %a, %b : f32
-    store %c, %arg2[%tx] : memref<?xf32>
+    memref.store %c, %arg2[%tx] : memref<?xf32>
     gpu.terminator
   }
   return
@@ -30,19 +30,19 @@
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
   %cf1dot23 = arith.constant 1.23 : f32
-  %0 = alloc() : memref<5xf32>
-  %1 = alloc() : memref<5xf32>
-  %2 = alloc() : memref<5xf32>
-  %3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
-  %4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
-  %5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
+  %0 = memref.alloc() : memref<5xf32>
+  %1 = memref.alloc() : memref<5xf32>
+  %2 = memref.alloc() : memref<5xf32>
+  %3 = memref.cast %0 : memref<5xf32> to memref<?xf32>
+  %4 = memref.cast %1 : memref<5xf32> to memref<?xf32>
+  %5 = memref.cast %2 : memref<5xf32> to memref<?xf32>
   scf.for %i = %c0 to %c5 step %c1 {
-    store %cf1dot23, %3[%i] : memref<?xf32>
-    store %cf1dot23, %4[%i] : memref<?xf32>
+    memref.store %cf1dot23, %3[%i] : memref<?xf32>
+    memref.store %cf1dot23, %4[%i] : memref<?xf32>
   }
-  %6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
-  %7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
-  %8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
+  %6 = memref.cast %3 : memref<?xf32> to memref<*xf32>
+  %7 = memref.cast %4 : memref<?xf32> to memref<*xf32>
+  %8 = memref.cast %5 : memref<?xf32> to memref<*xf32>
   gpu.host_register %6 : memref<*xf32>
   gpu.host_register %7 : memref<*xf32>
   gpu.host_register %8 : memref<*xf32>
diff --git a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
--- a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -59,19 +59,19 @@
   %cf1 = arith.constant 1.0 : f32
   %cf1dot23 = arith.constant 1.23 : f32
 
-  %arg0 = alloc() : memref<4xf32>
-  %arg1 = alloc() : memref<4xf32>
+  %arg0 = memref.alloc() : memref<4xf32>
+  %arg1 = memref.alloc() : memref<4xf32>
 
-  %22 = memref_cast %arg0 : memref<4xf32> to memref<?xf32>
-  %23 = memref_cast %arg1 : memref<4xf32> to memref<?xf32>
+  %22 = memref.cast %arg0 : memref<4xf32> to memref<?xf32>
+  %23 = memref.cast %arg1 : memref<4xf32> to memref<?xf32>
 
   scf.for %i = %c0 to %c4 step %c1 {
-    store %cf1dot23, %22[%i] : memref<?xf32>
-    store %cf1dot23, %23[%i] : memref<?xf32>
+    memref.store %cf1dot23, %22[%i] : memref<?xf32>
+    memref.store %cf1dot23, %23[%i] : memref<?xf32>
   }
 
-  %cast0 = memref_cast %22 : memref<?xf32> to memref<*xf32>
-  %cast1 = memref_cast %23 : memref<?xf32> to memref<*xf32>
+  %cast0 = memref.cast %22 : memref<?xf32> to memref<*xf32>
+  %cast1 = memref.cast %23 : memref<?xf32> to memref<*xf32>
 
   gpu.host_register %cast0 : memref<*xf32>
   gpu.host_register %cast1 : memref<*xf32>