diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
--- a/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -86,7 +86,13 @@
   GV_Max_Warp_Number,
   /// The slot size that should be reserved for a working warp.
   /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
-  GV_Warp_Size_Log2_MaskL
+  GV_Warp_Size_Log2_MaskL,
+  /// Total number of vector registers per CU or SM
+  GV_Vector_Register_Count,
+  /// Total number of scalar registers per CU or SM
+  GV_Scalar_Register_Count,
+  /// Total shared memory size per CU or SM in bytes
+  GV_Shared_Memory_Size
 };
 
 /// For AMDGPU GPUs
@@ -104,7 +110,10 @@
     1024,      // GV_Max_WG_Size,
     256,       // GV_Defaut_WG_Size
     1024 / 64, // GV_Max_WG_Size / GV_WarpSize
-    63         // GV_Warp_Size_Log2_MaskL
+    63,        // GV_Warp_Size_Log2_MaskL
+    64 * 1024, // GV_Vector_Register_Count
+    4 * 800,   // GV_Scalar_Register_Count
+    64 * 1024  // GV_Shared_Memory_Size
 };
 
 /// For Nvidia GPUs
@@ -122,7 +131,10 @@
     1024,      // GV_Max_WG_Size
     128,       // GV_Defaut_WG_Size
     1024 / 32, // GV_Max_WG_Size / GV_WarpSize
-    31         // GV_Warp_Size_Log2_MaskL
+    31,        // GV_Warp_Size_Log2_MaskL
+    64 * 1024, // GV_Vector_Register_Count
+    0,         // GV_Scalar_Register_Count (not applicable)
+    32 * 1024  // GV_Shared_Memory_Size (configurable)
 };
 
 } // namespace omp
diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -70,6 +70,15 @@
   }
 }
 
+// Number of SIMDs in a CU
+static const uint32_t NumSIMDsPerCU = 4;
+// Heuristic defaults used to compute kernel launch parameters.
+// Default number of waves per team is chosen equal to the number of SIMDs.
+static const uint32_t DefaultNumWavesPerTeam = NumSIMDsPerCU;
+// Default number of teams per CU is chosen for scheduling flexibility within a
+// SIMD.
+static const uint32_t DefaultNumTeamsPerCU = 4;
+
 int print_kernel_trace;
 
 #ifdef OMPTARGET_DEBUG
@@ -494,14 +503,22 @@
   static const unsigned HardTeamLimit =
       (1 << 16) - 1; // 64K needed to fit in uint16
   static const int DefaultNumTeams = 128;
-  static const int Max_Teams =
-      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
   static const int Warp_Size =
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Warp_Size];
+  static const int Max_Teams =
+      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Teams];
   static const int Max_WG_Size =
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_WG_Size];
   static const int Default_WG_Size =
       llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Default_WG_Size];
+  static const int Max_Warp_Number =
+      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Max_Warp_Number];
+  static const int Vector_Register_Count = llvm::omp::AMDGPUGpuGridValues
+      [llvm::omp::GVIDX::GV_Vector_Register_Count];
+  static const int Scalar_Register_Count = llvm::omp::AMDGPUGpuGridValues
+      [llvm::omp::GVIDX::GV_Scalar_Register_Count];
+  static const int LDS_Size =
+      llvm::omp::AMDGPUGpuGridValues[llvm::omp::GVIDX::GV_Shared_Memory_Size];
 
   using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, const void *,
                                       size_t size, hsa_agent_t);
@@ -1876,6 +1893,83 @@
   return OFFLOAD_SUCCESS;
 }
 
+/*
+   Determine the total number of teams in a kernel and the number of
+   threads in each team in order to maximize occupancy.
+
+   Here is the overall idea: we aim to schedule a certain number of
+   wavefronts, expressed as the product of the number of teams and the
+   number of wavefronts per team. Each CU has 4 SIMDs. To account for
+   each SIMD, we start with at least 4 wavefronts in a team. If the
+   total number of teams per CU is constrained by LDS usage, we try to
+   increase the number of wavefronts per team. If the total number of
+   wavefronts per SIMD is constrained by VGPR or SGPR usage, we reduce
+   the number of teams while keeping the number of wavefronts unchanged.
+*/
+void adjustTeamsAndThreadsBasedOnResources(int *NumTeams, int *NumThreads,
+                                           uint32_t lds_usage,
+                                           uint32_t sgpr_count,
+                                           uint32_t vgpr_count,
+                                           int32_t device_id) {
+  // Initialize the number of waves per team to the default
+  uint32_t num_waves_per_team = DefaultNumWavesPerTeam;
+
+  // lds_usage is reported per workgroup (i.e. team), so initialize
+  // the number of teams per CU based on lds_usage
+  uint32_t num_teams_per_cu =
+      std::min(RTLDeviceInfoTy::LDS_Size / (lds_usage ? lds_usage : 1),
+               DefaultNumTeamsPerCU);
+
+  // Compute the maximum number of waves per SIMD based on VGPR and SGPR usage
+  uint32_t vgprs_avail_per_simd =
+      RTLDeviceInfoTy::Vector_Register_Count / NumSIMDsPerCU;
+  // vgpr_count is per workitem (i.e. thread)
+  uint32_t vgpr_usage_per_wave = vgpr_count * RTLDeviceInfoTy::Warp_Size;
+  uint32_t vgpr_constrained_max_waves_per_simd =
+      vgprs_avail_per_simd / (vgpr_usage_per_wave ? vgpr_usage_per_wave : 1);
+
+  uint32_t sgprs_avail_per_simd =
+      RTLDeviceInfoTy::Scalar_Register_Count / NumSIMDsPerCU;
+  // sgpr_count is per wavefront
+  uint32_t sgpr_constrained_max_waves_per_simd =
+      sgprs_avail_per_simd / (sgpr_count ? sgpr_count : 1);
+
+  uint32_t max_waves_per_simd =
+      std::min(std::min(vgpr_constrained_max_waves_per_simd,
+                        sgpr_constrained_max_waves_per_simd),
+               DefaultNumTeamsPerCU);
+
+  uint32_t default_occupancy_factor =
+      DefaultNumTeamsPerCU * DefaultNumWavesPerTeam;
+  // Compute the occupancy factor based on constraints from LDS usage
+  uint32_t lds_constrained_occupancy_factor =
+      num_teams_per_cu * DefaultNumWavesPerTeam;
+  // Compute the occupancy factor based on constraints from VGPR and SGPR usage
+  uint32_t gpr_constrained_occupancy_factor =
+      max_waves_per_simd * DefaultNumWavesPerTeam;
+
+  // First, examine whether LDS usage is limiting the number of teams,
+  // regardless of any limits imposed by GPR usage
+  if (lds_constrained_occupancy_factor < default_occupancy_factor) {
+    // There is no benefit in increasing num_teams_per_cu, but try to
+    // increase the number of waves per team subject to the
+    // constraints imposed by GPR usage
+    num_waves_per_team = std::min(gpr_constrained_occupancy_factor /
+                                      (num_teams_per_cu ? num_teams_per_cu : 1),
+                                  (uint32_t)RTLDeviceInfoTy::Max_Warp_Number);
+  } else if (gpr_constrained_occupancy_factor < default_occupancy_factor) {
+    // Not much can be done; this kernel will run with reduced
+    // occupancy. Lower the number of teams to reflect that.
+    num_teams_per_cu = std::min(num_teams_per_cu, max_waves_per_simd);
+  }
+
+  // Total number of teams in the kernel
+  *NumTeams = num_teams_per_cu * DeviceInfo.ComputeUnits[device_id];
+
+  // Total number of threads in each team
+  *NumThreads = num_waves_per_team * RTLDeviceInfoTy::Warp_Size;
+}
+
 // Determine launch values for threadsPerGroup and num_groups.
 // Outputs: treadsPerGroup, num_groups
 // Inputs: Max_Teams, Max_WG_Size, Warp_Size, ExecutionMode,
@@ -1884,8 +1978,22 @@
 void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize,
                    int ExecutionMode, int EnvTeamLimit, int EnvNumTeams,
                    int num_teams, int thread_limit, uint64_t loop_tripcount,
+                   uint32_t lds_usage, uint32_t sgpr_count, uint32_t vgpr_count,
                    int32_t device_id) {
+  // In non-generic mode, if the user did not specify the number of teams
+  // or threads, adjust them based on resource usage
+  int NumTeamsBasedOnResources = 0;
+  int NumThreadsBasedOnResources = 0;
+  if (ExecutionMode != GENERIC && EnvNumTeams <= 0 && EnvTeamLimit <= 0 &&
+      DeviceInfo.EnvTeamThreadLimit <= 0 &&
+      DeviceInfo.EnvMaxTeamsDefault <= 0 && num_teams <= 0 &&
+      thread_limit <= 0) {
+    adjustTeamsAndThreadsBasedOnResources(
+        &NumTeamsBasedOnResources, &NumThreadsBasedOnResources, lds_usage,
+        sgpr_count, vgpr_count, device_id);
+  }
+
   int Max_Teams = DeviceInfo.EnvMaxTeamsDefault > 0
                       ? DeviceInfo.EnvMaxTeamsDefault
                       : DeviceInfo.NumTeams[device_id];
@@ -1896,6 +2004,7 @@
     fprintf(stderr, "RTLDeviceInfoTy::Max_Teams: %d\n",
             RTLDeviceInfoTy::Max_Teams);
     fprintf(stderr, "Max_Teams: %d\n", Max_Teams);
+    fprintf(stderr, "NumTeamsBasedOnResources: %d\n", NumTeamsBasedOnResources);
     fprintf(stderr, "RTLDeviceInfoTy::Warp_Size: %d\n",
             RTLDeviceInfoTy::Warp_Size);
     fprintf(stderr, "RTLDeviceInfoTy::Max_WG_Size: %d\n",
@@ -1904,8 +2013,21 @@
             RTLDeviceInfoTy::Default_WG_Size);
     fprintf(stderr, "thread_limit: %d\n", thread_limit);
     fprintf(stderr, "threadsPerGroup: %d\n", threadsPerGroup);
+    fprintf(stderr, "NumThreadsBasedOnResources: %d\n",
+            NumThreadsBasedOnResources);
     fprintf(stderr, "ConstWGSize: %d\n", ConstWGSize);
   }
+
+  if (NumTeamsBasedOnResources > 0 &&
+      NumTeamsBasedOnResources <= DeviceInfo.HardTeamLimit &&
+      NumThreadsBasedOnResources > 0 &&
+      NumThreadsBasedOnResources <= RTLDeviceInfoTy::Max_WG_Size) {
+    Max_Teams = NumTeamsBasedOnResources;
+    threadsPerGroup = NumThreadsBasedOnResources;
+    DP("Modifying Max_Teams based on resources: %d\n", Max_Teams);
+    DP("Modifying threadsPerGroup based on resources: %d\n", threadsPerGroup);
+  }
+
   // check for thread_limit() clause
   if (thread_limit > 0) {
     threadsPerGroup = thread_limit;
@@ -2105,6 +2227,7 @@
                        num_teams,      // From run_region arg
                        thread_limit,   // From run_region arg
                        loop_tripcount, // From run_region arg
+                       group_segment_size, sgpr_count, vgpr_count,
                        KernelInfo->device_id);
 
   if (print_kernel_trace >= LAUNCH) {
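
To make the heuristic concrete, here is a minimal standalone C++ sketch of the
same occupancy arithmetic as adjustTeamsAndThreadsBasedOnResources, runnable
outside the plugin. The constants mirror the AMDGPU grid values added above;
the kernel resource numbers (32 KiB of LDS per workgroup, 32 VGPRs per
workitem, 48 SGPRs per wavefront) and the 120-CU device are hypothetical
inputs chosen for illustration, and the variable names are not part of the
patch.

// occupancy_sketch.cpp -- standalone illustration only, not part of the patch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors of the AMDGPU grid values and heuristic defaults above.
static const uint32_t WarpSize = 64;
static const uint32_t MaxWarpNumber = 1024 / 64;       // GV_Max_Warp_Number
static const uint32_t VectorRegisterCount = 64 * 1024; // GV_Vector_Register_Count
static const uint32_t ScalarRegisterCount = 4 * 800;   // GV_Scalar_Register_Count
static const uint32_t LDSSize = 64 * 1024;             // GV_Shared_Memory_Size
static const uint32_t NumSIMDsPerCU = 4;
static const uint32_t DefaultNumWavesPerTeam = NumSIMDsPerCU;
static const uint32_t DefaultNumTeamsPerCU = 4;

int main() {
  // Hypothetical kernel resource usage, standing in for the values the
  // plugin reads from the code object.
  uint32_t lds_usage = 32 * 1024; // bytes of LDS per workgroup
  uint32_t vgpr_count = 32;       // VGPRs per workitem
  uint32_t sgpr_count = 48;       // SGPRs per wavefront
  uint32_t num_cus = 120;         // e.g. an MI100-class device

  // Teams per CU as limited by LDS (65536 / 32768 = 2, capped at 4).
  uint32_t num_waves_per_team = DefaultNumWavesPerTeam;
  uint32_t num_teams_per_cu =
      std::min(LDSSize / (lds_usage ? lds_usage : 1), DefaultNumTeamsPerCU);

  // Waves per SIMD as limited by VGPRs (16384 / 2048 = 8) and SGPRs
  // (800 / 48 = 16), capped at the default of 4.
  uint32_t vgpr_usage_per_wave = vgpr_count * WarpSize;
  uint32_t vgpr_waves = (VectorRegisterCount / NumSIMDsPerCU) /
                        (vgpr_usage_per_wave ? vgpr_usage_per_wave : 1);
  uint32_t sgpr_waves =
      (ScalarRegisterCount / NumSIMDsPerCU) / (sgpr_count ? sgpr_count : 1);
  uint32_t max_waves_per_simd =
      std::min(std::min(vgpr_waves, sgpr_waves), DefaultNumTeamsPerCU);

  uint32_t default_of = DefaultNumTeamsPerCU * DefaultNumWavesPerTeam; // 16
  uint32_t lds_of = num_teams_per_cu * DefaultNumWavesPerTeam;         // 8
  uint32_t gpr_of = max_waves_per_simd * DefaultNumWavesPerTeam;       // 16

  if (lds_of < default_of) {
    // LDS-bound: keep the reduced team count but grow each team,
    // subject to the GPR limit (16 / 2 = 8 waves per team here).
    num_waves_per_team = std::min(
        gpr_of / (num_teams_per_cu ? num_teams_per_cu : 1), MaxWarpNumber);
  } else if (gpr_of < default_of) {
    // GPR-bound: lower the number of teams instead.
    num_teams_per_cu = std::min(num_teams_per_cu, max_waves_per_simd);
  }

  // Prints "NumTeams = 240, NumThreads = 512" for these inputs.
  std::printf("NumTeams = %u, NumThreads = %u\n", num_teams_per_cu * num_cus,
              num_waves_per_team * WarpSize);
  return 0;
}

With these inputs the kernel is LDS-bound: only two workgroups fit per CU, so
the heuristic grows each team to 8 wavefronts instead, keeping 16 wavefronts
resident per CU (2 teams x 8 waves), the same occupancy the 4x4 default
targets.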