diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
--- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp
@@ -13,6 +13,8 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <cstdlib>
+#include <fstream>
 #include <cuda.h>
 #include <list>
 #include <memory>
@@ -77,6 +79,35 @@
   NONE
 };
 
+struct FunctionPropertiesInfo {
+  /// Number of basic blocks.
+  int64_t BasicBlockCount = 0;
+
+  /// Number of blocks reached from a conditional instruction, or that are
+  /// 'cases' of a SwitchInstr.
+  int64_t BlocksReachedFromConditionalInstruction = 0;
+
+  /// Number of uses of this function, plus 1 if the function is callable
+  /// outside the module.
+  int64_t Uses = 0;
+
+  /// Number of direct calls made from this function to other functions
+  /// defined in this module.
+  int64_t DirectCallsToDefinedFunctions = 0;
+
+  /// Number of load instructions.
+  int64_t LoadInstCount = 0;
+
+  /// Number of store instructions.
+  int64_t StoreInstCount = 0;
+
+  /// Maximum loop depth in the function.
+  int64_t MaxLoopDepth = 0;
+
+  /// Number of top-level loops in the function.
+  int64_t TopLevelLoopCount = 0;
+};
+
 /// Use a single entity to encode a kernel and a set of flags.
 struct KernelTy {
   CUfunction Func;
@@ -87,11 +118,14 @@
   // 2 - SPMD mode execution with Generic mode semantics.
   int8_t ExecutionMode;
 
+  /// Numeric features of the kernel as extracted by FunctionPropertiesAnalysis.
+  FunctionPropertiesInfo KernelFeatures;
+
   /// Maximal number of threads per block for this kernel.
   int MaxThreadsPerBlock = 0;
 
-  KernelTy(CUfunction _Func, int8_t _ExecutionMode)
-      : Func(_Func), ExecutionMode(_ExecutionMode) {}
+  KernelTy(CUfunction _Func, int8_t _ExecutionMode, FunctionPropertiesInfo _KernelFeatures)
+      : Func(_Func), ExecutionMode(_ExecutionMode), KernelFeatures(_KernelFeatures) {}
 };
 
 /// Device environment data
@@ -810,7 +844,30 @@
         CUDA_ERR_STRING(Err);
       }
 
-      KernelsList.emplace_back(Func, ExecModeVal);
+      std::string KernelFeaturesNameStr(E->name);
+      KernelFeaturesNameStr += "_KernelFeatures";
+      const char *KernelFeaturesName = KernelFeaturesNameStr.c_str();
+      CUdeviceptr KernelFeaturesPtr;
+      size_t KernelFeaturesSize;
+      Err = cuModuleGetGlobal(&KernelFeaturesPtr, &KernelFeaturesSize, Module, KernelFeaturesName);
+      FunctionPropertiesInfo KernelFeatures;
+      if (Err == CUDA_SUCCESS) {
+        assert(sizeof(KernelFeatures) == KernelFeaturesSize);
+        Err = cuMemcpyDtoH(&KernelFeatures, KernelFeaturesPtr, KernelFeaturesSize);
+        if (Err == CUDA_SUCCESS) {
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Kernel features for `%s`:\n", E->name);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Basic block count = %ld\n", KernelFeatures.BasicBlockCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Blocks reached from conditional = %ld\n", KernelFeatures.BlocksReachedFromConditionalInstruction);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Direct calls to defined functions = %ld\n", KernelFeatures.DirectCallsToDefinedFunctions);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Load instruction count = %ld\n", KernelFeatures.LoadInstCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Max loop depth = %ld\n", KernelFeatures.MaxLoopDepth);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Store instruction count = %ld\n", KernelFeatures.StoreInstCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Top-level loop count = %ld\n", KernelFeatures.TopLevelLoopCount);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Uses = %ld\n", KernelFeatures.Uses);
+        }
+      }
+
+      KernelsList.emplace_back(Func, ExecModeVal, KernelFeatures);
 
       __tgt_offload_entry Entry = *E;
       Entry.addr = &KernelsList.back();
@@ -1086,22 +1143,132 @@
       CudaBlocksPerGrid = TeamNum;
     }
 
-    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
-         "Launching kernel %s with %d blocks and %d threads in %s "
-         "mode\n",
-         (getOffloadEntry(DeviceId, TgtEntryPtr))
-             ? getOffloadEntry(DeviceId, TgtEntryPtr)->name
-             : "(null)",
-         CudaBlocksPerGrid, CudaThreadsPerBlock,
-         (KernelInfo->ExecutionMode != SPMD
-              ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic")
-              : "SPMD"));
-
     CUstream Stream = getStream(DeviceId, AsyncInfo);
-    Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
-                         /* gridDimZ */ 1, CudaThreadsPerBlock,
-                         /* blockDimY */ 1, /* blockDimZ */ 1,
-                         /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+    // The environment variable BENCHMARK_KERNEL enables the benchmarking functionality.
+    // If BENCHMARK_KERNEL is set, the kernel is executed with a range of launch configurations and the
+    // results are appended to the file named by the environment variable BENCHMARK_KERNEL_LOG.
+    // Otherwise the kernel is executed normally.
+    if (std::getenv("BENCHMARK_KERNEL")) {
+      const char *KernelName = getOffloadEntry(DeviceId, TgtEntryPtr) ? getOffloadEntry(DeviceId, TgtEntryPtr)->name : "(null)";
+      // The range for "number of blocks per grid" is evaluated as follows:
+      // - If the parameter TeamNum is set to a positive value, the range degenerates to TeamNum.
+      // - Otherwise the range is all powers of 2 from 1 up to the maximum supported number of blocks
+      //   per grid. E.g., if the device supports up to 65536 blocks per grid, the benchmarking code
+      //   runs the kernel with 1, 2, 4, ..., 32768 and 65536 blocks per grid.
+      int MinBlocksPerGrid, MaxBlocksPerGrid;
+      if (TeamNum <= 0) {
+        MinBlocksPerGrid = 1;
+        MaxBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using %d to %d CUDA blocks\n",
+             KernelName, MinBlocksPerGrid, MaxBlocksPerGrid);
+      }
+      else {
+        MinBlocksPerGrid = MaxBlocksPerGrid = CudaBlocksPerGrid;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using the requested number of CUDA blocks (%d)\n",
+             KernelName, CudaBlocksPerGrid);
+      }
+      // The range for "number of threads per block" is defined similarly, based on the ThreadLimit parameter.
+      int MinThreadsPerBlock, MaxThreadsPerBlock;
+      if (ThreadLimit <= 0) {
+        MinThreadsPerBlock = 1;
+        MaxThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using %d to %d threads per block\n",
+             KernelName, MinThreadsPerBlock, MaxThreadsPerBlock);
+      }
+      else {
+        MinThreadsPerBlock = MaxThreadsPerBlock = CudaThreadsPerBlock;
+        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+             "Running kernel `%s` using the requested number of threads (%d)\n",
+             KernelName, CudaThreadsPerBlock);
+      }
+      // The log file is a CSV file with the following columns:
+      // - KernelName,
+      // - one column for each of the kernel features,
+      // - one column for each of the runtime parameters (currently only BlocksPerGrid and ThreadsPerBlock),
+      // - ExecutionMs.
+      // Each row of the file represents a single run of the kernel, identified by the kernel name and by
+      // the runtime parameters used for that run.
+      char *LogFileName = std::getenv("BENCHMARK_KERNEL_LOG");
+      std::ofstream LogFileOutputStream;
+      std::ifstream LogFileInputStream;
+      if (LogFileName) {
+        LogFileOutputStream.open(LogFileName, std::ios_base::app);
+        LogFileInputStream.open(LogFileName);
+        if (LogFileInputStream.peek() == std::fstream::traits_type::eof()) {
+          LogFileOutputStream << "KernelName,BasicBlocks,BlocksReachedFromConditionals,DirectCallsToDefinedFunctions,"
+                              << "LoadInsts,MaxLoopDepth,StoreInsts,TopLevelLoops,Uses,"
+                              << "BlocksPerGrid,ThreadsPerBlock,ExecutionMs\n";
+          LogFileOutputStream.flush();
+        }
+        LogFileInputStream.close();
+      }
+      for (int TrialThreadsPerBlock = MaxThreadsPerBlock; TrialThreadsPerBlock >= MinThreadsPerBlock; TrialThreadsPerBlock /= 2) {
+        for (int TrialBlocksPerGrid = MaxBlocksPerGrid; TrialBlocksPerGrid >= MinBlocksPerGrid; TrialBlocksPerGrid /= 2) {
+          // The runtime of each trial is measured with CUDA events; see also
+          // https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/
+          CUevent StartEvent, StopEvent;
+          cuEventCreate(&StartEvent, CU_EVENT_DEFAULT);
+          cuEventCreate(&StopEvent, CU_EVENT_DEFAULT);
+          cuEventRecord(StartEvent, Stream);
+          CUresult TrialErr = cuLaunchKernel(KernelInfo->Func, TrialBlocksPerGrid, /* gridDimY */ 1,
+                                             /* gridDimZ */ 1, TrialThreadsPerBlock,
+                                             /* blockDimY */ 1, /* blockDimZ */ 1,
+                                             /* sharedMemBytes */ 0, Stream, &Args[0], nullptr);
+          if (Err == CUDA_SUCCESS)
+            Err = TrialErr;
+          cuEventRecord(StopEvent, Stream);
+          cuEventSynchronize(StopEvent);
+          float ExecutionMs = 0;
+          cuEventElapsedTime(&ExecutionMs, StartEvent, StopEvent);
+          cuEventDestroy(StartEvent);
+          cuEventDestroy(StopEvent);
+          INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
+               "Kernel %s executed in %.3f ms with %d blocks and %d threads in %s "
+               "mode\n",
+               KernelName,
+               ExecutionMs,
+               TrialBlocksPerGrid, TrialThreadsPerBlock,
+               (KernelInfo->ExecutionMode != SPMD
"Generic" : "SPMD-Generic") + : "SPMD")); + if (LogFileOutputStream.is_open()) { + LogFileOutputStream << KernelName << "," + << KernelInfo->KernelFeatures.BasicBlockCount << "," + << KernelInfo->KernelFeatures.BlocksReachedFromConditionalInstruction << "," + << KernelInfo->KernelFeatures.DirectCallsToDefinedFunctions << "," + << KernelInfo->KernelFeatures.LoadInstCount << "," + << KernelInfo->KernelFeatures.MaxLoopDepth << "," + << KernelInfo->KernelFeatures.StoreInstCount << "," + << KernelInfo->KernelFeatures.TopLevelLoopCount << "," + << KernelInfo->KernelFeatures.Uses << "," + << TrialBlocksPerGrid << "," << TrialThreadsPerBlock << "," << ExecutionMs << "\n"; + LogFileOutputStream.flush(); + } + } + } + if (LogFileOutputStream.is_open()) { + LogFileOutputStream.close(); + } + } + else { + INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, + "Launching kernel %s with %d blocks and %d threads in %s " + "mode\n", + (getOffloadEntry(DeviceId, TgtEntryPtr)) + ? getOffloadEntry(DeviceId, TgtEntryPtr)->name + : "(null)", + CudaBlocksPerGrid, CudaThreadsPerBlock, + (KernelInfo->ExecutionMode != SPMD + ? (KernelInfo->ExecutionMode == GENERIC ? "Generic" : "SPMD-Generic") + : "SPMD")); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, + /* gridDimZ */ 1, CudaThreadsPerBlock, + /* blockDimY */ 1, /* blockDimZ */ 1, + /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); + } if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) return OFFLOAD_FAIL;