diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -205,6 +205,9 @@
           /* kmp_task_t */ VoidPtr, Int32,
           /* kmp_task_affinity_info_t */ VoidPtr)
 
+__OMP_RTL(__kmpc_get_hardware_num_blocks, false, Int32, )
+__OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
+
 __OMP_RTL(omp_get_thread_num, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
@@ -596,6 +599,9 @@
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs,
                            AttributeSet(), ReadOnlyPtrAttrs))
 
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_blocks, GetterAttrs, AttributeSet(), ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, AttributeSet(), ParamAttrs())
+
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, AttributeSet(), ParamAttrs())
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1763,6 +1763,10 @@
     return Changed == ChangeStatus::CHANGED;
   }
 
+  /// Populate the Attributor with abstract attribute opportunities in the
+  /// function.
+  void registerFoldRuntimeCall(RuntimeFunction RF);
+
   /// Populate the Attributor with abstract attribute opportunities in the
   /// function.
   void registerAAs(bool IsModulePass);
@@ -3358,6 +3362,8 @@
     case OMPRTL___kmpc_is_spmd_exec_mode:
     case OMPRTL___kmpc_for_static_fini:
     case OMPRTL___kmpc_global_thread_num:
+    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+    case OMPRTL___kmpc_get_hardware_num_blocks:
     case OMPRTL___kmpc_single:
     case OMPRTL___kmpc_end_single:
     case OMPRTL___kmpc_master:
@@ -3511,11 +3517,16 @@
 
   ChangeStatus updateImpl(Attributor &A) override {
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
     switch (RFKind) {
     case OMPRTL___kmpc_is_spmd_exec_mode:
       Changed = Changed | foldIsSPMDExecMode(A);
       break;
+    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+      Changed = Changed | foldHardwareNumThreads(A);
+      break;
+    case OMPRTL___kmpc_get_hardware_num_blocks:
+      Changed = Changed | foldHardwareNumTeams(A);
+      break;
     default:
       llvm_unreachable("Unhandled OpenMP runtime function!");
     }
@@ -3546,6 +3557,7 @@
     auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
         *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
 
+    // What actual kernels reach this function
     for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
       auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
                                           DepClassTy::REQUIRED);
@@ -3599,6 +3611,78 @@
                                      : ChangeStatus::CHANGED;
   }
 
+  /// Fold __kmpc_get_hardware_num_blocks into a constant if possible.
+  /// The value is an attribute in the kernel
+  ChangeStatus foldHardwareNumTeams(Attributor &A) {
+    // Specialize only if all the calls agree with the number of teams
+    int32_t CurrentNumTeams = -1;
+    BooleanState StateBefore = getState();
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
+
+    // What actual kernels reach this function
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      if (K->hasFnAttribute("NumTeams")) {
+        int32_t NumT = 
+            std::stoi(K->getFnAttribute("NumTeams").getValueAsString().str());
+        if (CurrentNumTeams != -1 && CurrentNumTeams != NumT) {
+          SimplifiedValue = nullptr;
+          return indicatePessimisticFixpoint();
+        } else {
+          CurrentNumTeams = NumT;
+        }
+      } else {
+        // TODO: No attribute, then default?
+      }
+    }
+
+    if (CurrentNumTeams != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      SimplifiedValue = 
+          ConstantInt::get(Type::getInt8Ty(Ctx), CurrentNumTeams);
+    }
+
+    return getState() == StateBefore ? ChangeStatus::UNCHANGED
+                                     : ChangeStatus::CHANGED;
+  }
+
+  /// Fold __kmpc_get_hardware_num_threads_in_block into a constant if possible.
+  /// The value is an attribute in the kernel
+  ChangeStatus foldHardwareNumThreads(Attributor &A) {
+    // Specialize only if all the calls agree with the number of threads
+    int32_t CurrentNumThreads = -1;
+    BooleanState StateBefore = getState();
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
+
+    // What actual kernels reach this function
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      if (K->hasFnAttribute("NumThreads")) {
+        int32_t NumT = 
+            std::stoi(K->getFnAttribute("NumThreads").getValueAsString().str());
+        if (CurrentNumThreads != -1 && CurrentNumThreads != NumT) {
+          SimplifiedValue = nullptr;
+          return indicatePessimisticFixpoint();
+        } else {
+          CurrentNumThreads = NumT;
+        }
+      } else {
+        // TODO: No attribute, then default?
+      }
+    }
+
+    if (CurrentNumThreads != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      SimplifiedValue = 
+          ConstantInt::get(Type::getInt8Ty(Ctx), CurrentNumThreads);
+    }
+    
+    return getState() == StateBefore ? ChangeStatus::UNCHANGED
+                                     : ChangeStatus::CHANGED;
+  }
+
   /// An optional value the associated value is assumed to fold to. That is, we
   /// assume the associated value (which is a call) can be replaced by this
   /// simplified value.
@@ -3610,6 +3694,21 @@
 
 } // namespace
 
+/// Register folding callsite
+void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
+  auto &RFI = OMPInfoCache.RFIs[RF];
+  RFI.foreachUse(SCC, [&](Use &U, Function &) {
+    CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+    if (!CI)
+      return false;
+    A.getOrCreateAAFor<AAFoldRuntimeCall>(
+        IRPosition::callsite_function(*CI), /* QueryingAA */ nullptr,
+        DepClassTy::NONE, /* ForceUpdate */ false,
+        /* UpdateAfterInit */ false);
+    return false;
+  });
+}
+
 void OpenMPOpt::registerAAs(bool IsModulePass) {
   if (SCC.empty())
 
@@ -3619,23 +3718,16 @@
     // update. This will make sure we register all value simplification
     // callbacks before any other AA has the chance to create an AAValueSimplify
     // or similar.
-    for (Function *Kernel : OMPInfoCache.Kernels)
+    for (Function *Kernel : OMPInfoCache.Kernels) {
       A.getOrCreateAAFor<AAKernelInfo>(
           IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
           DepClassTy::NONE, /* ForceUpdate */ false,
           /* UpdateAfterInit */ false);
+    }
 
-    auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode];
-    IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_function(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-      return false;
-    });
+    registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
   }
 
   // Create CallSite AA for all Getters.
diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+@G = external global i32
+;.
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i8
+;.
+define weak void @kernel0() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel0() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+  ret void
+}
+
+define weak void @kernel1() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel1() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper1()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define weak void @kernel2() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel2() #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define internal void @helper0() {
+; CHECK-LABEL: define {{[^@]+}}@helper0() {
+; CHECK-NEXT:    store i32 666, i32* @G, align 1
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+define internal void @helper1() {
+; CHECK-LABEL: define {{[^@]+}}@helper1() {
+; CHECK-NEXT:    br label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    unreachable
+; CHECK:       f:
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  %c = icmp eq i32 %threadLimit, 666
+  br i1 %c, label %t, label %f
+t:
+  call void @helper0()
+  ret void
+f:
+  ret void
+}
+
+define internal void @helper2() {
+; CHECK-LABEL: define {{[^@]+}}@helper2() {
+; CHECK-NEXT:    store i32 666, i8* @G
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+declare i32 @__kmpc_get_hardware_num_threads_in_block()
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
+declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+
+
+!llvm.module.flags = !{!0, !1}
+!nvvm.annotations = !{!2, !3, !4}
+
+attributes #0 = { "NumThreads"="666" "NumTeams"="777"}
+
+!0 = !{i32 7, !"openmp", i32 50}
+!1 = !{i32 7, !"openmp-device", i32 50}
+!2 = !{void ()* @kernel0, !"kernel", i32 1}
+!3 = !{void ()* @kernel1, !"kernel", i32 1}
+!4 = !{void ()* @kernel2, !"kernel", i32 1}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
+;.
+; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1}
+; CHECK: [[META3:![0-9]+]] = !{void ()* @kernel1, !"kernel", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{void ()* @kernel2, !"kernel", i32 1}
+;.
diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
--- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -122,12 +122,12 @@
 }
 } // namespace
 
-EXTERN int GetNumberOfBlocksInKernel() {
+EXTERN int __kmpc_get_hardware_num_blocks() {
   return get_grid_dim(__builtin_amdgcn_grid_size_x(),
                       __builtin_amdgcn_workgroup_size_x());
 }
 
-EXTERN int GetNumberOfThreadsInBlock() {
+EXTERN int __kmpc_get_hardware_num_threads_in_block() {
   return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
                            __builtin_amdgcn_grid_size_x(),
                            __builtin_amdgcn_workgroup_size_x());
@@ -140,7 +140,7 @@
 }
 
 EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
-  return GetNumberOfThreadsInBlock();
+  return __kmpc_get_hardware_num_threads_in_block();
 }
 
 // Atomics
diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu
@@ -61,7 +61,7 @@
 
 EXTERN int omp_get_thread_limit(void) {
   if (__kmpc_is_spmd_exec_mode())
-    return GetNumberOfThreadsInBlock();
+    return __kmpc_get_hardware_num_threads_in_block();
   int rc = threadLimit;
   PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
   return rc;
@@ -196,7 +196,7 @@
 
 EXTERN int omp_get_team_size(int level) {
   if (__kmpc_is_spmd_exec_mode())
-    return level == 1 ? GetNumberOfThreadsInBlock() : 1;
+    return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1;
   int rc = -1;
   unsigned parLevel = parallelLevel[GetWarpId()];
   // If level is 0 or all parallel regions are not active - return 1.
diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu
@@ -94,7 +94,7 @@
 
   if (GetLaneId() == 0) {
     parallelLevel[GetWarpId()] =
-        1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
+        1 + (__kmpc_get_hardware_num_threads_in_block() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0);
   }
 
   __kmpc_data_sharing_init_stack();
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -197,7 +197,7 @@
       __kmpc_is_spmd_exec_mode() ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
                          : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
-  uint32_t NumTeams = GetNumberOfBlocksInKernel();
+  uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
   static unsigned SHARED(Bound);
   static unsigned SHARED(ChunkTeamCount);
 
diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu
--- a/openmp/libomptarget/deviceRTLs/common/src/support.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu
@@ -53,7 +53,7 @@
 //
 // Called in Generic Execution Mode only.
 int GetMasterThreadID() {
-  return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1);
+  return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1);
 }
 
 // The last warp is reserved for the master; other warps are workers.
@@ -109,7 +109,7 @@
   if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
     rc = 1;
   } else if (isSPMDExecutionMode) {
-    rc = GetNumberOfThreadsInBlock();
+    rc = __kmpc_get_hardware_num_threads_in_block();
   } else {
     rc = threadsInTeam;
   }
@@ -127,7 +127,7 @@
 
 int GetNumberOfOmpTeams() {
   // omp_num_teams
-  return GetNumberOfBlocksInKernel(); // assume 1 block per team
+  return __kmpc_get_hardware_num_blocks(); // assume 1 block per team
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -169,7 +169,7 @@
 int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
   if (!isSPMDExecutionMode)
     return GetNumberOfWorkersInTeam();
-  return GetNumberOfThreadsInBlock();
+  return __kmpc_get_hardware_num_threads_in_block();
 }
 
 int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -50,8 +50,8 @@
 int GetOmpTeamId();                           // omp_team_num
 
 // get OpenMP number of threads and team
-int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
-int GetNumberOfOmpTeams();                           // omp_num_teams
+NOINLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
+NOINLINE int GetNumberOfOmpTeams();                           // omp_num_teams
 
 // get OpenMP number of procs
 int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -94,10 +94,10 @@
 // Calls to the NVPTX layer (assuming 1D layout)
 EXTERN int GetThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); }
 EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
-EXTERN int GetNumberOfBlocksInKernel() {
+EXTERN int __kmpc_get_hardware_num_blocks() {
   return __nvvm_read_ptx_sreg_nctaid_x();
 }
-EXTERN int GetNumberOfThreadsInBlock() { return __nvvm_read_ptx_sreg_ntid_x(); }
+EXTERN int __kmpc_get_hardware_num_threads_in_block() { return __nvvm_read_ptx_sreg_ntid_x(); }
 EXTERN unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; }
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); }
diff --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -18,8 +18,8 @@
 // Calls to the NVPTX layer (assuming 1D layout)
 EXTERN int GetThreadIdInBlock();
 EXTERN int GetBlockIdInKernel();
-EXTERN int GetNumberOfBlocksInKernel();
-EXTERN int GetNumberOfThreadsInBlock();
+EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
+EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
 EXTERN unsigned GetWarpId();
 EXTERN unsigned GetWarpSize();
 EXTERN unsigned GetLaneId();