Index: flang/docs/Extensions.md
===================================================================
--- flang/docs/Extensions.md
+++ flang/docs/Extensions.md
@@ -63,6 +63,13 @@
   not the bounds of the implied DO loop.  It is not advisable to use
   an object of the same name as the index variable in a bounds
   expression, but it will work, instead of being needlessly undefined.
+* If both the `COUNT=` and the `COUNT_MAX=` optional arguments are
+  present on the same call to the intrinsic subroutine `SYSTEM_CLOCK`,
+  we require that their types have the same integer kind, since the
+  kind of these arguments is used to select the clock rate.
+  In common with some other compilers, the clock counts milliseconds
+  for kinds <= 4 and nanoseconds otherwise, on target systems that
+  support these rates.
 
 ## Extensions, deletions, and legacy features supported by default
 
Index: flang/include/flang/Runtime/time-intrinsic.h
===================================================================
--- flang/include/flang/Runtime/time-intrinsic.h
+++ flang/include/flang/Runtime/time-intrinsic.h
@@ -12,8 +12,8 @@
 #ifndef FORTRAN_RUNTIME_TIME_INTRINSIC_H_
 #define FORTRAN_RUNTIME_TIME_INTRINSIC_H_
 
-#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/entry-names.h"
+#include <cinttypes>
 
 namespace Fortran::runtime {
 
@@ -27,10 +27,14 @@
 
 // Interface for the SYSTEM_CLOCK intrinsic. We break it up into 3 distinct
 // function calls, one for each of SYSTEM_CLOCK's optional output arguments.
-// Lowering will have to cast the results to whatever type it prefers.
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCount)();
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCountRate)();
-CppTypeFor<TypeCategory::Integer, 8> RTNAME(SystemClockCountMax)();
+// Lowering converts the results to the types of the actual arguments,
+// including the case of a real argument for COUNT_RATE=.
+// The kind argument to SystemClockCount and SystemClockCountMax is the
+// kind of the integer actual arguments, which are required to be the same
+// when both appear.
+std::int64_t RTNAME(SystemClockCount)(int kind = 8);
+std::int64_t RTNAME(SystemClockCountRate)(int kind = 8);
+std::int64_t RTNAME(SystemClockCountMax)(int kind = 8);
 
 // Interface for DATE_AND_TIME intrinsic.
 void RTNAME(DateAndTime)(char *date, std::size_t dateChars, char *time,
Index: flang/lib/Evaluate/intrinsics.cpp
===================================================================
--- flang/lib/Evaluate/intrinsics.cpp
+++ flang/lib/Evaluate/intrinsics.cpp
@@ -1127,11 +1127,11 @@
                 common::Intent::Out}},
         {}, Rank::elemental, IntrinsicClass::impureSubroutine},
     {"system_clock",
-        {{"count", AnyInt, Rank::scalar, Optionality::optional,
+        {{"count", SameInt, Rank::scalar, Optionality::optional,
              common::Intent::Out},
             {"count_rate", AnyIntOrReal, Rank::scalar, Optionality::optional,
                 common::Intent::Out},
-            {"count_max", AnyInt, Rank::scalar, Optionality::optional,
+            {"count_max", SameInt, Rank::scalar, Optionality::optional,
                 common::Intent::Out}},
         {}, Rank::elemental, IntrinsicClass::impureSubroutine},
 };
Index: flang/runtime/time-intrinsic.cpp
===================================================================
--- flang/runtime/time-intrinsic.cpp
+++ flang/runtime/time-intrinsic.cpp
@@ -11,6 +11,7 @@
 #include "flang/Runtime/time-intrinsic.h"
 #include "terminator.h"
 #include "tools.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
 #include <algorithm>
 #include <cstdint>
@@ -51,18 +52,10 @@
   if (timestamp != static_cast<std::clock_t>(-1)) {
     return static_cast<double>(timestamp) / CLOCKS_PER_SEC;
   }
-
   // Return some negative value to represent failure.
   return -1.0;
 }
 
-// POSIX implementation using clock_gettime. This is only enabled if
-// clock_gettime is available.
-template <typename T = int, typename U = struct timespec>
-double GetCpuTime(preferred_implementation,
-    // We need some dummy parameters to pass to decltype(clock_gettime).
-    T ClockId = 0, U *Timespec = nullptr,
-    decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
 #if defined CLOCK_THREAD_CPUTIME_ID
 #define CLOCKID CLOCK_THREAD_CPUTIME_ID
 #elif defined CLOCK_PROCESS_CPUTIME_ID
@@ -72,106 +65,119 @@
 #else
 #define CLOCKID CLOCK_REALTIME
 #endif
+
+// POSIX implementation using clock_gettime. This is only enabled where
+// clock_gettime is available.
+template <typename T = int, typename U = struct timespec>
+double GetCpuTime(preferred_implementation,
+    // We need some dummy parameters to pass to decltype(clock_gettime).
+    T ClockId = 0, U *Timespec = nullptr,
+    decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
   struct timespec tspec;
   if (clock_gettime(CLOCKID, &tspec) == 0) {
     return tspec.tv_nsec * 1.0e-9 + tspec.tv_sec;
   }
-
   // Return some negative value to represent failure.
   return -1.0;
 }
 
-using count_t =
-    Fortran::runtime::CppTypeFor<Fortran::common::TypeCategory::Integer, 8>;
+using count_t = std::int64_t;
+using unsigned_count_t = std::uint64_t;
+
+// Computes HUGE(INT(0,kind)) as an unsigned integer value.  The byte width
+// is clamped to 1..8 to keep the shift count in range: wider kinds saturate
+// at the 64-bit value, and a nonpositive kind is treated as kind 1.
+static constexpr inline unsigned_count_t GetHUGE(int kind) {
+  const int bytes{kind < 1 ? 1 : kind > 8 ? 8 : kind};
+  return (unsigned_count_t{1} << ((8 * bytes) - 1)) - 1;
+}
 
 // This is the fallback implementation, which should work everywhere. Note that
 // in general we can't recover after std::clock has reached its maximum value.
 template <typename Unused = void>
-count_t GetSystemClockCount(fallback_implementation) {
+count_t GetSystemClockCount(int kind, fallback_implementation) {
   std::clock_t timestamp{std::clock()};
   if (timestamp == static_cast<std::clock_t>(-1)) {
-    // Return -HUGE() to represent failure.
-    return -std::numeric_limits<count_t>::max();
+    // Return -HUGE(COUNT) to represent failure.
+    return -static_cast<count_t>(GetHUGE(kind));
   }
-
-  // If our return type is large enough to hold any value returned by
-  // std::clock, our work is done. Otherwise, we have to wrap around.
-  static constexpr auto max{std::numeric_limits<count_t>::max()};
-  if constexpr (std::numeric_limits<std::clock_t>::max() <= max) {
-    return static_cast<count_t>(timestamp);
-  } else {
-    // Since std::clock_t could be a floating point type, we can't just use the
-    // % operator, so we have to wrap around manually.
-    return static_cast<count_t>(timestamp - max * std::floor(timestamp / max));
+  // Convert the timestamp to std::uint64_t with wrap-around. std::clock_t may
+  // be a floating-point type, so when its range exceeds 64 bits the modulus is
+  // computed with std::floor instead of the % operator.
+  constexpr auto maxUnsignedCount{std::numeric_limits<unsigned_count_t>::max()};
+  if constexpr (std::numeric_limits<std::clock_t>::max() > maxUnsignedCount) {
+    timestamp -= maxUnsignedCount * std::floor(timestamp / maxUnsignedCount);
   }
+  unsigned_count_t unsignedCount{static_cast<unsigned_count_t>(timestamp)};
+  // Return the modulus of the unsigned integral count with HUGE(COUNT).
+  // The result is a signed integer but never negative.
+  return static_cast<count_t>(unsignedCount % GetHUGE(kind));
 }
 
 template <typename Unused = void>
-count_t GetSystemClockCountRate(fallback_implementation) {
+count_t GetSystemClockCountRate(int kind, fallback_implementation) {
   return CLOCKS_PER_SEC;
 }
 
 template <typename Unused = void>
-count_t GetSystemClockCountMax(fallback_implementation) {
-  static constexpr auto max_clock_t = std::numeric_limits<std::clock_t>::max();
-  static constexpr auto max_count_t = std::numeric_limits<count_t>::max();
-  if constexpr (max_clock_t < max_count_t) {
-    return static_cast<count_t>(max_clock_t);
-  } else {
-    return max_count_t;
-  }
+count_t GetSystemClockCountMax(int kind, fallback_implementation) {
+  constexpr auto max_clock_t{std::numeric_limits<std::clock_t>::max()};
+  unsigned_count_t maxCount{GetHUGE(kind) - 1}; // COUNT is modulo HUGE(COUNT)
+  return max_clock_t <= maxCount ? static_cast<count_t>(max_clock_t)
+                                 : static_cast<count_t>(maxCount);
 }
 
-constexpr count_t NSECS_PER_SEC{1'000'000'000};
+// POSIX implementation using clock_gettime. This is only enabled where
+// clock_gettime is available.  Use a millisecond COUNT_RATE for kinds
+// of COUNT/COUNT_MAX less than 64 bits, and nanoseconds otherwise.
+constexpr unsigned_count_t MILLIS_PER_SEC{1'000u};
+constexpr unsigned_count_t NSECS_PER_SEC{1'000'000'000u};
+constexpr unsigned_count_t maxSecs{
+    std::numeric_limits<unsigned_count_t>::max() / NSECS_PER_SEC};
+
+// Use a millisecond clock rate for smaller COUNT= kinds.
+static inline unsigned_count_t ScaleResult(unsigned_count_t nsecs, int kind) {
+  return kind >= 8 ? nsecs : nsecs / (NSECS_PER_SEC / MILLIS_PER_SEC);
+}
 
-// POSIX implementation using clock_gettime. This is only enabled if
-// clock_gettime is available.
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCount(preferred_implementation,
+count_t GetSystemClockCount(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-#if defined CLOCK_THREAD_CPUTIME_ID
-#define CLOCKID CLOCK_THREAD_CPUTIME_ID
-#elif defined CLOCK_PROCESS_CPUTIME_ID
-#define CLOCKID CLOCK_PROCESS_CPUTIME_ID
-#elif defined CLOCK_MONOTONIC
-#define CLOCKID CLOCK_MONOTONIC
-#else
-#define CLOCKID CLOCK_REALTIME
-#endif
   struct timespec tspec;
   if (clock_gettime(CLOCKID, &tspec) != 0) {
     // Return -HUGE() to represent failure.
-    return -std::numeric_limits<count_t>::max();
+    return -static_cast<count_t>(GetHUGE(kind));
   }
-
   // Wrap around to avoid overflows.
-  constexpr count_t max_secs{
-      std::numeric_limits<count_t>::max() / NSECS_PER_SEC};
-  count_t wrapped_secs{tspec.tv_sec % max_secs};
-
-  // At this point, wrapped_secs < max_secs, and max_secs has already been
-  // truncated by the division. Therefore, we should still have enough room to
-  // add tv_nsec, since it is < NSECS_PER_SEC.
-  return tspec.tv_nsec + wrapped_secs * NSECS_PER_SEC;
+  unsigned_count_t wrappedSecs{
+      static_cast<unsigned_count_t>(tspec.tv_sec) % maxSecs};
+  unsigned_count_t unsignedNsecs{static_cast<unsigned_count_t>(tspec.tv_nsec) +
+      wrappedSecs * NSECS_PER_SEC};
+  unsigned_count_t unsignedCount{ScaleResult(unsignedNsecs, kind)};
+  // Return the modulus of the unsigned integral count with HUGE(COUNT).
+  // The result is a signed integer but never negative.
+  return static_cast<count_t>(unsignedCount % GetHUGE(kind));
 }
 
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCountRate(preferred_implementation,
+count_t GetSystemClockCountRate(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-  return NSECS_PER_SEC;
+  return static_cast<count_t>(kind >= 8 ? NSECS_PER_SEC : MILLIS_PER_SEC);
 }
 
 template <typename T = int, typename U = struct timespec>
-count_t GetSystemClockCountMax(preferred_implementation,
+count_t GetSystemClockCountMax(int kind, preferred_implementation,
     // We need some dummy parameters to pass to decltype(clock_gettime).
     T ClockId = 0, U *Timespec = nullptr,
     decltype(clock_gettime(ClockId, Timespec)) *Enabled = nullptr) {
-  count_t max_secs{std::numeric_limits<count_t>::max() / NSECS_PER_SEC};
-  return max_secs * NSECS_PER_SEC - 1;
+  // Seconds wrap modulo maxSecs in GetSystemClockCount, so the largest raw
+  // nanosecond count is maxSecs * NSECS_PER_SEC - 1, which fits in 64 bits.
+  unsigned_count_t maxClock{ScaleResult(maxSecs * NSECS_PER_SEC - 1, kind)};
+  return static_cast<count_t>(std::min(maxClock, GetHUGE(kind) - 1));
 }
 
 // DATE_AND_TIME (Fortran 2018 16.9.59)
@@ -198,7 +204,7 @@
 
 // Default implementation when date and time information is not available (set
 // strings to blanks and values to -HUGE as defined by the standard).
-void DateAndTimeUnavailable(Fortran::runtime::Terminator &terminator,
+static void DateAndTimeUnavailable(Fortran::runtime::Terminator &terminator,
     char *date, std::size_t dateChars, char *time, std::size_t timeChars,
     char *zone, std::size_t zoneChars,
     const Fortran::runtime::Descriptor *values) {
@@ -259,9 +265,9 @@
   };
 };
 
-// Dispatch to posix implemetation when gettimeofday and localtime_r are
+// Dispatch to posix implementation where gettimeofday and localtime_r are
 // available.
-void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
+static void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
     std::size_t dateChars, char *time, std::size_t timeChars, char *zone,
     std::size_t zoneChars, const Fortran::runtime::Descriptor *values) {
 
@@ -330,9 +336,9 @@
 }
 
 #else
-// Fallback implementation when gettimeofday or localtime_r is not available
-// (e.g. windows).
-void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
+// Fallback implementation where gettimeofday or localtime_r are not both
+// available (e.g. windows).
+static void GetDateAndTime(Fortran::runtime::Terminator &terminator, char *date,
     std::size_t dateChars, char *time, std::size_t timeChars, char *zone,
     std::size_t zoneChars, const Fortran::runtime::Descriptor *values) {
   // TODO: An actual implementation for non Posix system should be added.
@@ -342,26 +348,23 @@
       terminator, date, dateChars, time, timeChars, zone, zoneChars, values);
 }
 #endif
-} // anonymous namespace
+} // namespace
 
 namespace Fortran::runtime {
 extern "C" {
 
 double RTNAME(CpuTime)() { return GetCpuTime(0); }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCount)() {
-  return GetSystemClockCount(0);
+std::int64_t RTNAME(SystemClockCount)(int kind) {
+  return GetSystemClockCount(kind, 0);
 }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCountRate)() {
-  return GetSystemClockCountRate(0);
+std::int64_t RTNAME(SystemClockCountRate)(int kind) {
+  return GetSystemClockCountRate(kind, 0);
 }
 
-CppTypeFor<Fortran::common::TypeCategory::Integer, 8> RTNAME(
-    SystemClockCountMax)() {
-  return GetSystemClockCountMax(0);
+std::int64_t RTNAME(SystemClockCountMax)(int kind) {
+  return GetSystemClockCountMax(kind, 0);
 }
 
 void RTNAME(DateAndTime)(char *date, std::size_t dateChars, char *time,
Index: flang/unittests/Runtime/Time.cpp
===================================================================
--- flang/unittests/Runtime/Time.cpp
+++ flang/unittests/Runtime/Time.cpp
@@ -31,7 +31,7 @@
   }
 }
 
-using count_t = CppTypeFor<TypeCategory::Integer, 8>;
+using count_t = std::int64_t;
 
 TEST(TimeIntrinsics, SystemClock) {
   // We can't really test that we get the "right" result for SYSTEM_CLOCK, but
@@ -43,21 +43,46 @@
   // SYSTEM_CLOCK.
   EXPECT_GT(RTNAME(SystemClockCountRate)(), 0);
 
-  count_t max{RTNAME(SystemClockCountMax)()};
-  EXPECT_GT(max, 0);
+  count_t max1{RTNAME(SystemClockCountMax)(1)};
+  EXPECT_GT(max1, 0);
+  EXPECT_LE(max1, static_cast<count_t>(0x7f));
+  count_t start1{RTNAME(SystemClockCount)(1)};
+  EXPECT_GE(start1, 0);
+  EXPECT_LE(start1, max1);
 
-  count_t start{RTNAME(SystemClockCount)()};
-  EXPECT_GE(start, 0);
-  EXPECT_LE(start, max);
+  count_t max2{RTNAME(SystemClockCountMax)(2)};
+  EXPECT_GT(max2, 0);
+  EXPECT_LE(max2, static_cast<count_t>(0x7fff));
+  count_t start2{RTNAME(SystemClockCount)(2)};
+  EXPECT_GE(start2, 0);
+  EXPECT_LE(start2, max2);
+
+  count_t max4{RTNAME(SystemClockCountMax)(4)};
+  EXPECT_GT(max4, 0);
+  EXPECT_LE(max4, static_cast<count_t>(0x7fffffff));
+  count_t start4{RTNAME(SystemClockCount)(4)};
+  EXPECT_GE(start4, 0);
+  EXPECT_LE(start4, max4);
+
+  count_t max8{RTNAME(SystemClockCountMax)(8)};
+  EXPECT_GT(max8, 0);
+  count_t start8{RTNAME(SystemClockCount)(8)};
+  EXPECT_GE(start8, 0);
+  EXPECT_LT(start8, max8);
+
+  count_t max16{RTNAME(SystemClockCountMax)(16)};
+  EXPECT_GT(max16, 0);
+  count_t start16{RTNAME(SystemClockCount)(16)};
+  EXPECT_GE(start16, 0);
+  EXPECT_LT(start16, max16);
 
   // Loop until we get a different value from SystemClockCount. If we don't get
   // one before we time out, then we should probably look into an implementation
   // for SystemClokcCount with a better timer resolution on this platform.
-  for (count_t end = start; end == start; end = RTNAME(SystemClockCount)()) {
+  for (count_t end{start8}; end == start8; end = RTNAME(SystemClockCount)(8)) {
     EXPECT_GE(end, 0);
-    EXPECT_LE(end, max);
-
-    EXPECT_GE(end, start);
+    EXPECT_LE(end, max8);
+    EXPECT_GE(end, start8);
   }
 }