diff --git a/flang/include/flang/Evaluate/traverse.h b/flang/include/flang/Evaluate/traverse.h
--- a/flang/include/flang/Evaluate/traverse.h
+++ b/flang/include/flang/Evaluate/traverse.h
@@ -38,6 +38,7 @@
 // expression of an ASSOCIATE (or related) construct entity.
 
 #include "expression.h"
+#include "flang/Common/indirection.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/type.h"
 #include <set>
@@ -53,6 +54,10 @@
   Result operator()(const common::Indirection<A, C> &x) const {
     return visitor_(x.value());
   }
+  template <typename A>
+  Result operator()(const common::ForwardOwningPointer<A> &p) const {
+    return visitor_(p.get());
+  }
   template <typename _> Result operator()(const SymbolRef x) const {
     return visitor_(*x);
   }
@@ -76,13 +81,17 @@
       return visitor_.Default();
     }
   }
-  template <typename... A>
-  Result operator()(const std::variant<A...> &u) const {
-    return common::visit(visitor_, u);
+  template <typename... As>
+  Result operator()(const std::variant<As...> &u) const {
+    return common::visit([=](const auto &y) { return visitor_(y); }, u);
   }
   template <typename A> Result operator()(const std::vector<A> &x) const {
     return CombineContents(x);
   }
+  template <typename A, typename B>
+  Result operator()(const std::pair<A, B> &x) const {
+    return Combine(x.first, x.second);
+  }
 
   // Leaves
   Result operator()(const BOZLiteralConstant &) const {
@@ -233,14 +242,24 @@
   template <typename T> Result operator()(const Expr<T> &x) const {
     return visitor_(x.u);
   }
+  Result operator()(const Assignment &x) const {
+    return Combine(x.lhs, x.rhs, x.u);
+  }
+  Result operator()(const Assignment::Intrinsic &) const {
+    return visitor_.Default();
+  }
+  Result operator()(const GenericExprWrapper &x) const { return visitor_(x.v); }
+  Result operator()(const GenericAssignmentWrapper &x) const {
+    return visitor_(x.v);
+  }
 
 private:
   template <typename ITER> Result CombineRange(ITER iter, ITER end) const {
     if (iter == end) {
       return visitor_.Default();
     } else {
-      Result result{visitor_(*iter++)};
-      for (; iter != end; ++iter) {
+      Result result{visitor_(*iter)};
+      for (++iter; iter != end; ++iter) {
         result = visitor_.Combine(std::move(result), visitor_(*iter));
       }
       return result;
diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h
--- a/flang/include/flang/Parser/tools.h
+++ b/flang/include/flang/Parser/tools.h
@@ -65,6 +65,18 @@
     return common::visit([](const auto &y) { return Unwrap<A>(y); }, x);
   }
 
+  template <typename A, std::size_t J = 0, typename... Bs>
+  static const A *Unwrap(const std::tuple<Bs...> &x) {
+    if constexpr (J < sizeof...(Bs)) {
+      if (auto result{Unwrap<A>(std::get<J>(x))}) {
+        return result;
+      }
+      return Unwrap<A, (J + 1)>(x);
+    } else {
+      return nullptr;
+    }
+  }
+
   template <typename A, typename B>
   static const A *Unwrap(const std::optional<B> &o) {
     if (o) {
@@ -122,5 +134,120 @@
 template <typename A>
 struct HasTypedExpr<A, decltype(static_cast<void>(A::typedExpr), 0)>
     : std::true_type {};
+
+// GetSource()
+
+template <bool GET_FIRST> struct GetSourceHelper {
+
+  using Result = std::optional<CharBlock>;
+
+  template <typename A> static Result GetSource(A *p) {
+    if (p) {
+      return GetSource(*p);
+    } else {
+      return std::nullopt;
+    }
+  }
+  template <typename A>
+  static Result GetSource(const common::Indirection<A> &x) {
+    return GetSource(x.value());
+  }
+
+  template <typename A, bool COPY>
+  static Result GetSource(const common::Indirection<A, COPY> &x) {
+    return GetSource(x.value());
+  }
+
+  template <typename... As>
+  static Result GetSource(const std::variant<As...> &x) {
+    return common::visit([](const auto &y) { return GetSource(y); }, x);
+  }
+
+  template <std::size_t J = 0, typename... As>
+  static Result GetSource(const std::tuple<As...> &x) {
+    if constexpr (J < sizeof...(As)) {
+      constexpr std::size_t index{GET_FIRST ? J : sizeof...(As) - J - 1};
+      if (auto result{GetSource(std::get<index>(x))}) {
+        return result;
+      }
+      return GetSource<(J + 1)>(x);
+    } else {
+      return {};
+    }
+  }
+
+  template <typename A> static Result GetSource(const std::optional<A> &o) {
+    if (o) {
+      return GetSource(*o);
+    } else {
+      return {};
+    }
+  }
+
+  template <typename A> static Result GetSource(const std::list<A> &x) {
+    if constexpr (GET_FIRST) {
+      for (const A &y : x) {
+        if (auto result{GetSource(y)}) {
+          return result;
+        }
+      }
+    } else {
+      for (auto iter{x.rbegin()}; iter != x.rend(); ++iter) {
+        if (auto result{GetSource(*iter)}) {
+          return result;
+        }
+      }
+    }
+    return {};
+  }
+
+  template <typename A> static Result GetSource(const std::vector<A> &x) {
+    if constexpr (GET_FIRST) {
+      for (const A &y : x) {
+        if (auto result{GetSource(y)}) {
+          return result;
+        }
+      }
+    } else {
+      for (auto iter{x.rbegin()}; iter != x.rend(); ++iter) {
+        if (auto result{GetSource(*iter)}) {
+          return result;
+        }
+      }
+    }
+    return {};
+  }
+
+  template <typename A> static Result GetSource(A &x) {
+    if constexpr (HasSource<A>::value) {
+      return x.source;
+    } else if constexpr (ConstraintTrait<A>) {
+      return GetSource(x.thing);
+    } else if constexpr (WrapperTrait<A>) {
+      return GetSource(x.v);
+    } else if constexpr (UnionTrait<A>) {
+      return GetSource(x.u);
+    } else if constexpr (TupleTrait<A>) {
+      return GetSource(x.t);
+    } else {
+      return {};
+    }
+  }
+};
+
+template <typename A> std::optional<CharBlock> GetSource(const A &x) {
+  return GetSourceHelper<true>::GetSource(x);
+}
+template <typename A> std::optional<CharBlock> GetSource(A &x) {
+  return GetSourceHelper<true>::GetSource(const_cast<const A &>(x));
+}
+
+template <typename A> std::optional<CharBlock> GetLastSource(const A &x) {
+  return GetSourceHelper<false>::GetSource(x);
+}
+template <typename A> std::optional<CharBlock> GetLastSource(A &x) {
+  return GetSourceHelper<false>::GetSource(const_cast<const A &>(x));
+}
+
 } // namespace Fortran::parser
 #endif // FORTRAN_PARSER_TOOLS_H_
diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h
--- a/flang/include/flang/Semantics/semantics.h
+++ b/flang/include/flang/Semantics/semantics.h
@@ -214,8 +214,9 @@
   // Defines builtinsScope_ from the __Fortran_builtins module
   void UseFortranBuiltinsModule();
   const Scope *GetBuiltinsScope() const { return builtinsScope_; }
+
   void UsePPCFortranBuiltinTypesModule();
-  const Scope *GetCUDABuiltinsScope();
+  const Scope &GetCUDABuiltinsScope();
   void UsePPCFortranBuiltinsModule();
   Scope *GetPPCBuiltinTypesScope() { return ppcBuiltinTypesScope_; }
   const Scope *GetPPCBuiltinsScope() const { return ppcBuiltinsScope_; }
@@ -281,7 +282,7 @@
   std::set<std::string> tempNames_;
   const Scope *builtinsScope_{nullptr}; // module __Fortran_builtins
   Scope *ppcBuiltinTypesScope_{nullptr}; // module __Fortran_PPC_types
-  std::optional<const Scope *> CUDABuiltinsScope_; // module __CUDA_builtins
+  std::optional<const Scope *> cudaBuiltinsScope_; // module __CUDA_builtins
   const Scope *ppcBuiltinsScope_{nullptr}; // module __Fortran_PPC_intrinsics
   std::list<parser::Program> modFileParseTrees_;
   std::unique_ptr<CommonBlockMap> commonBlockMap_;
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -1698,7 +1698,7 @@
     Put('('), Walk(std::get<std::list<ActualArgSpec>>(x.v.t), ", "), Put(')');
   }
   void Unparse(const CallStmt &x) { // R1521
-    if (asFortran_ && x.typedCall.get() && !x.chevrons /*CUDA todo*/) {
+    if (asFortran_ && x.typedCall.get()) {
       Put(' ');
       asFortran_->call(out_, *x.typedCall);
       Put('\n');
diff --git a/flang/lib/Semantics/CMakeLists.txt b/flang/lib/Semantics/CMakeLists.txt
--- a/flang/lib/Semantics/CMakeLists.txt
+++ b/flang/lib/Semantics/CMakeLists.txt
@@ -10,6 +10,7 @@
   check-call.cpp
   check-case.cpp
   check-coarray.cpp
+  check-cuda.cpp
   check-data.cpp
   check-deallocate.cpp
   check-declarations.cpp
diff --git a/flang/lib/Semantics/canonicalize-acc.cpp b/flang/lib/Semantics/canonicalize-acc.cpp
--- a/flang/lib/Semantics/canonicalize-acc.cpp
+++ b/flang/lib/Semantics/canonicalize-acc.cpp
@@ -65,7 +65,7 @@
 
         const auto &outer{std::get<std::optional<parser::DoConstruct>>(x.t)};
         if (outer->IsDoConcurrent()) {
-          return; // Tile is not allowed on DO CONURRENT
+          return; // Tile is not allowed on DO CONCURRENT
         }
         for (const parser::DoConstruct *loop{&*outer}; loop && tileArgNb > 0;
              --tileArgNb) {
diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp
--- a/flang/lib/Semantics/check-allocate.cpp
+++ b/flang/lib/Semantics/check-allocate.cpp
@@ -31,6 +31,8 @@
   bool gotTypeSpec{false};
   bool gotSource{false};
   bool gotMold{false};
+  bool gotStream{false};
+  bool gotPinned{false};
 };
 
 class AllocationCheckerHelper {
@@ -179,8 +181,22 @@
               parserSourceExpr = &mold.v.value();
               info.gotMold = true;
             },
-            [](const parser::AllocOpt::Stream &) { /* CUDA coming */ },
-            [](const parser::AllocOpt::Pinned &) { /* CUDA coming */ },
+            [&](const parser::AllocOpt::Stream &stream) { // CUDA
+              if (info.gotStream) {
+                context.Say(
+                    "STREAM may not be duplicated in a ALLOCATE statement"_err_en_US);
+                stopCheckingAllocate = true;
+              }
+              info.gotStream = true;
+            },
+            [&](const parser::AllocOpt::Pinned &pinned) { // CUDA
+              if (info.gotPinned) {
+                context.Say(
+                    "PINNED may not be duplicated in a ALLOCATE statement"_err_en_US);
+                stopCheckingAllocate = true;
+              }
+              info.gotPinned = true;
+            },
         },
         allocOpt.u);
   }
@@ -569,12 +585,13 @@
     return false;
   }
   context.CheckIndexVarRedefine(name_);
+  const Scope &subpScope{
+      GetProgramUnitContaining(context.FindScope(name_.source))};
   if (allocateObject_.typedExpr && allocateObject_.typedExpr->v) {
-    if (auto whyNot{
-            WhyNotDefinable(name_.source, context.FindScope(name_.source),
-                {DefinabilityFlag::PointerDefinition,
-                    DefinabilityFlag::AcceptAllocatable},
-                *allocateObject_.typedExpr->v)}) {
+    if (auto whyNot{WhyNotDefinable(name_.source, subpScope,
+            {DefinabilityFlag::PointerDefinition,
+                DefinabilityFlag::AcceptAllocatable},
+            *allocateObject_.typedExpr->v)}) {
       context
           .Say(name_.source,
               "Name in ALLOCATE statement is not definable"_err_en_US)
diff --git a/flang/lib/Semantics/check-cuda.h b/flang/lib/Semantics/check-cuda.h
new file mode 100644
--- /dev/null
+++ b/flang/lib/Semantics/check-cuda.h
@@ -0,0 +1,50 @@
+//===-- lib/Semantics/check-cuda.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_SEMANTICS_CHECK_CUDA_H_
+#define FORTRAN_SEMANTICS_CHECK_CUDA_H_
+
+#include "flang/Semantics/semantics.h"
+#include <list>
+
+namespace Fortran::parser {
+struct Program;
+class Messages;
+struct Name;
+class CharBlock;
+struct ExecutionPartConstruct;
+struct ExecutableConstruct;
+struct ActionStmt;
+struct IfConstruct;
+struct CUFKernelDoConstruct;
+struct SubroutineSubprogram;
+struct FunctionSubprogram;
+struct SeparateModuleSubprogram;
+} // namespace Fortran::parser
+
+namespace Fortran::semantics {
+
+class SemanticsContext;
+
+class CUDAChecker : public virtual BaseChecker {
+public:
+  explicit CUDAChecker(SemanticsContext &c) : context_{c} {}
+  void Enter(const parser::SubroutineSubprogram &);
+  void Enter(const parser::FunctionSubprogram &);
+  void Enter(const parser::SeparateModuleSubprogram &);
+  void Enter(const parser::CUFKernelDoConstruct &);
+
+private:
+  SemanticsContext &context_;
+};
+
+bool CanonicalizeCUDA(parser::Program &);
+
+} // namespace Fortran::semantics
+
+#endif // FORTRAN_SEMANTICS_CHECK_CUDA_H_
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
new file mode 100644
--- /dev/null
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -0,0 +1,416 @@
+//===-- lib/Semantics/check-cuda.cpp ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "check-cuda.h"
+#include "flang/Common/template.h"
+#include "flang/Evaluate/fold.h"
+#include "flang/Evaluate/traverse.h"
+#include "flang/Parser/parse-tree-visitor.h"
+#include "flang/Parser/parse-tree.h"
+#include "flang/Parser/tools.h"
+#include "flang/Semantics/expression.h"
+#include "flang/Semantics/symbol.h"
+
+// Once labeled DO constructs have been canonicalized and their parse subtrees
+// transformed into parser::DoConstructs, scan the parser::Blocks of the program
+// and merge adjacent CUFKernelDoConstructs and DoConstructs whenever the
+// CUFKernelDoConstruct doesn't already have an embedded DoConstruct.  Also
+// emit errors about improper or missing DoConstructs.
+
+namespace Fortran::parser {
+struct Mutator {
+  template <typename A> bool Pre(A &) { return true; }
+  template <typename A> void Post(A &) {}
+  bool Pre(Block &);
+};
+
+bool Mutator::Pre(Block &block) {
+  for (auto iter{block.begin()}; iter != block.end(); ++iter) {
+    if (auto *kernel{Unwrap<CUFKernelDoConstruct>(*iter)}) {
+      auto &nested{std::get<std::optional<DoConstruct>>(kernel->t)};
+      if (!nested) {
+        if (auto next{iter}; ++next != block.end()) {
+          if (auto *doConstruct{Unwrap<DoConstruct>(*next)}) {
+            nested = std::move(*doConstruct);
+            block.erase(next);
+          }
+        }
+      }
+    } else {
+      Walk(*iter, *this);
+    }
+  }
+  return false;
+}
+} // namespace Fortran::parser
+
+namespace Fortran::semantics {
+
+bool CanonicalizeCUDA(parser::Program &program) {
+  parser::Mutator mutator;
+  parser::Walk(program, mutator);
+  return true;
+}
+
+using MaybeMsg = std::optional<parser::MessageFormattedText>;
+
+// Traverses an evaluate::Expr<> in search of unsupported operations
+// on the device.
+
+struct DeviceExprChecker
+    : public evaluate::AnyTraverse<DeviceExprChecker, MaybeMsg> {
+  using Result = MaybeMsg;
+  using Base = evaluate::AnyTraverse<DeviceExprChecker, Result>;
+  DeviceExprChecker() : Base(*this) {}
+  using Base::operator();
+  Result operator()(const evaluate::ProcedureDesignator &x) const {
+    if (const Symbol * sym{x.GetInterfaceSymbol()}) {
+      const auto *subp{
+          sym->GetUltimate().detailsIf<semantics::SubprogramDetails>()};
+      if (subp) {
+        if (auto attrs{subp->cudaSubprogramAttrs()}) {
+          if (*attrs == common::CUDASubprogramAttrs::HostDevice ||
+              *attrs == common::CUDASubprogramAttrs::Device) {
+            return {};
+          }
+        }
+      }
+    } else if (x.GetSpecificIntrinsic()) {
+      // TODO(CUDA): Check for unsupported intrinsics here
+      return {};
+    }
+    return parser::MessageFormattedText(
+        "'%s' may not be called in device code"_err_en_US, x.GetName());
+  }
+};
+
+template <typename A> static MaybeMsg CheckUnwrappedExpr(const A &x) {
+  if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
+    return DeviceExprChecker{}(expr->typedExpr);
+  }
+  return {};
+}
+
+template <typename A>
+static void CheckUnwrappedExpr(
+    SemanticsContext &context, SourceName at, const A &x) {
+  if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
+    if (auto msg{DeviceExprChecker{}(expr->typedExpr)}) {
+      context.Say(at, std::move(*msg));
+    }
+  }
+}
+
+template <bool CUF_KERNEL> struct ActionStmtChecker {
+  template <typename A> static MaybeMsg WhyNotOk(const A &x) {
+    if constexpr (ConstraintTrait<A>) {
+      return WhyNotOk(x.thing);
+    } else if constexpr (WrapperTrait<A>) {
+      return WhyNotOk(x.v);
+    } else if constexpr (UnionTrait<A>) {
+      return WhyNotOk(x.u);
+    } else if constexpr (TupleTrait<A>) {
+      return WhyNotOk(x.t);
+    } else {
+      return parser::MessageFormattedText{
+          "Statement may not appear in device code"_err_en_US};
+    }
+  }
+  template <typename A>
+  static MaybeMsg WhyNotOk(const common::Indirection<A> &x) {
+    return WhyNotOk(x.value());
+  }
+  template <typename... As>
+  static MaybeMsg WhyNotOk(const std::variant<As...> &x) {
+    return common::visit([](const auto &x) { return WhyNotOk(x); }, x);
+  }
+  template <std::size_t J = 0, typename... As>
+  static MaybeMsg WhyNotOk(const std::tuple<As...> &x) {
+    if constexpr (J == sizeof...(As)) {
+      return {};
+    } else if (auto msg{WhyNotOk(std::get<J>(x))}) {
+      return msg;
+    } else {
+      return WhyNotOk<(J + 1)>(x);
+    }
+  }
+  template <typename A> static MaybeMsg WhyNotOk(const std::list<A> &x) {
+    for (const auto &y : x) {
+      if (MaybeMsg result{WhyNotOk(y)}) {
+        return result;
+      }
+    }
+    return {};
+  }
+  template <typename A> static MaybeMsg WhyNotOk(const std::optional<A> &x) {
+    if (x) {
+      return WhyNotOk(*x);
+    } else {
+      return {};
+    }
+  }
+  template <typename A>
+  static MaybeMsg WhyNotOk(const parser::UnlabeledStatement<A> &x) {
+    return WhyNotOk(x.statement);
+  }
+  template <typename A>
+  static MaybeMsg WhyNotOk(const parser::Statement<A> &x) {
+    return WhyNotOk(x.statement);
+  }
+  static MaybeMsg WhyNotOk(const parser::AllocateStmt &) {
+    return {}; // AllocateObjects are checked elsewhere
+  }
+  static MaybeMsg WhyNotOk(const parser::AllocateCoarraySpec &) {
+    return parser::MessageFormattedText(
+        "A coarray may not be allocated on the device"_err_en_US);
+  }
+  static MaybeMsg WhyNotOk(const parser::DeallocateStmt &) {
+    return {}; // AllocateObjects are checked elsewhere
+  }
+  static MaybeMsg WhyNotOk(const parser::AssignmentStmt &x) {
+    return DeviceExprChecker{}(x.typedAssignment);
+  }
+  static MaybeMsg WhyNotOk(const parser::CallStmt &x) {
+    return DeviceExprChecker{}(x.typedCall);
+  }
+  static MaybeMsg WhyNotOk(const parser::ContinueStmt &) { return {}; }
+  static MaybeMsg WhyNotOk(const parser::IfStmt &x) {
+    if (auto result{
+            CheckUnwrappedExpr(std::get<parser::ScalarLogicalExpr>(x.t))}) {
+      return result;
+    }
+    return WhyNotOk(
+        std::get<parser::UnlabeledStatement<parser::ActionStmt>>(x.t)
+            .statement);
+  }
+  static MaybeMsg WhyNotOk(const parser::NullifyStmt &x) {
+    for (const auto &y : x.v) {
+      if (MaybeMsg result{DeviceExprChecker{}(y.typedExpr)}) {
+        return result;
+      }
+    }
+    return {};
+  }
+  static MaybeMsg WhyNotOk(const parser::PointerAssignmentStmt &x) {
+    return DeviceExprChecker{}(x.typedAssignment);
+  }
+};
+
+template <bool IsCUFKernelDo> class DeviceContextChecker {
+public:
+  explicit DeviceContextChecker(SemanticsContext &c) : context_{c} {}
+  void CheckSubprogram(const parser::Name &name, const parser::Block &body) {
+    if (name.symbol) {
+      const auto *subp{
+          name.symbol->GetUltimate().detailsIf<SubprogramDetails>()};
+      if (subp && subp->moduleInterface()) {
+        subp = subp->moduleInterface()
+                   ->GetUltimate()
+                   .detailsIf<SubprogramDetails>();
+      }
+      if (subp &&
+          subp->cudaSubprogramAttrs().value_or(
+              common::CUDASubprogramAttrs::Host) !=
+              common::CUDASubprogramAttrs::Host) {
+        Check(body);
+      }
+    }
+  }
+  void Check(const parser::Block &block) {
+    for (const auto &epc : block) {
+      Check(epc);
+    }
+  }
+
+private:
+  void Check(const parser::ExecutionPartConstruct &epc) {
+    common::visit(
+        common::visitors{
+            [&](const parser::ExecutableConstruct &x) { Check(x); },
+            [&](const parser::Statement<common::Indirection<parser::EntryStmt>>
+                    &x) {
+              context_.Say(x.source,
+                  "Device code may not contain an ENTRY statement"_err_en_US);
+            },
+            [](const parser::Statement<common::Indirection<parser::FormatStmt>>
+                    &) {},
+            [](const parser::Statement<common::Indirection<parser::DataStmt>>
+                    &) {},
+            [](const parser::Statement<
+                common::Indirection<parser::NamelistStmt>> &) {},
+            [](const parser::ErrorRecovery &) {},
+        },
+        epc.u);
+  }
+  void Check(const parser::ExecutableConstruct &ec) {
+    common::visit(
+        common::visitors{
+            [&](const parser::Statement<parser::ActionStmt> &stmt) {
+              Check(stmt.statement, stmt.source);
+            },
+            [&](const common::Indirection<parser::DoConstruct> &x) {
+              if (const std::optional<parser::LoopControl> &control{
+                      x.value().GetLoopControl()}) {
+                common::visit([&](const auto &y) { Check(y); }, control->u);
+              }
+              Check(std::get<parser::Block>(x.value().t));
+            },
+            [&](const common::Indirection<parser::BlockConstruct> &x) {
+              Check(std::get<parser::Block>(x.value().t));
+            },
+            [&](const common::Indirection<parser::IfConstruct> &x) {
+              Check(x.value());
+            },
+            [&](const auto &x) {
+              if (auto source{parser::GetSource(x)}) {
+                context_.Say(*source,
+                    "Statement may not appear in device code"_err_en_US);
+              }
+            },
+        },
+        ec.u);
+  }
+  void Check(const parser::ActionStmt &stmt, const parser::CharBlock &source) {
+    common::visit(
+        common::visitors{
+            [&](const auto &x) {
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+                context_.Say(source, std::move(*msg));
+              }
+            },
+        },
+        stmt.u);
+  }
+  void Check(const parser::IfConstruct &ic) {
+    const auto &ifS{std::get<parser::Statement<parser::IfThenStmt>>(ic.t)};
+    CheckUnwrappedExpr(context_, ifS.source,
+        std::get<parser::ScalarLogicalExpr>(ifS.statement.t));
+    Check(std::get<parser::Block>(ic.t));
+    for (const auto &eib :
+        std::get<std::list<parser::IfConstruct::ElseIfBlock>>(ic.t)) {
+      const auto &eIfS{std::get<parser::Statement<parser::ElseIfStmt>>(eib.t)};
+      CheckUnwrappedExpr(context_, eIfS.source,
+          std::get<parser::ScalarLogicalExpr>(eIfS.statement.t));
+      Check(std::get<parser::Block>(eib.t));
+    }
+    if (const auto &eb{
+            std::get<std::optional<parser::IfConstruct::ElseBlock>>(ic.t)}) {
+      Check(std::get<parser::Block>(eb->t));
+    }
+  }
+  void Check(const parser::LoopControl::Bounds &bounds) {
+    Check(bounds.lower);
+    Check(bounds.upper);
+    if (bounds.step) {
+      Check(*bounds.step);
+    }
+  }
+  void Check(const parser::LoopControl::Concurrent &x) {
+    const auto &header{std::get<parser::ConcurrentHeader>(x.t)};
+    for (const auto &cc :
+        std::get<std::list<parser::ConcurrentControl>>(header.t)) {
+      Check(std::get<1>(cc.t));
+      Check(std::get<2>(cc.t));
+      if (const auto &step{
+              std::get<std::optional<parser::ScalarIntExpr>>(cc.t)}) {
+        Check(*step);
+      }
+    }
+    if (const auto &mask{
+            std::get<std::optional<parser::ScalarLogicalExpr>>(header.t)}) {
+      Check(*mask);
+    }
+  }
+  void Check(const parser::ScalarLogicalExpr &x) {
+    Check(DEREF(parser::Unwrap<parser::Expr>(x)));
+  }
+  void Check(const parser::ScalarIntExpr &x) {
+    Check(DEREF(parser::Unwrap<parser::Expr>(x)));
+  }
+  void Check(const parser::ScalarExpr &x) {
+    Check(DEREF(parser::Unwrap<parser::Expr>(x)));
+  }
+  void Check(const parser::Expr &expr) {
+    if (MaybeMsg msg{DeviceExprChecker{}(expr.typedExpr)}) {
+      context_.Say(expr.source, std::move(*msg));
+    }
+  }
+
+  SemanticsContext &context_;
+};
+
+void CUDAChecker::Enter(const parser::SubroutineSubprogram &x) {
+  DeviceContextChecker<false>{context_}.CheckSubprogram(
+      std::get<parser::Name>(
+          std::get<parser::Statement<parser::SubroutineStmt>>(x.t).statement.t),
+      std::get<parser::ExecutionPart>(x.t).v);
+}
+
+void CUDAChecker::Enter(const parser::FunctionSubprogram &x) {
+  DeviceContextChecker<false>{context_}.CheckSubprogram(
+      std::get<parser::Name>(
+          std::get<parser::Statement<parser::FunctionStmt>>(x.t).statement.t),
+      std::get<parser::ExecutionPart>(x.t).v);
+}
+
+void CUDAChecker::Enter(const parser::SeparateModuleSubprogram &x) {
+  DeviceContextChecker<false>{context_}.CheckSubprogram(
+      std::get<parser::Statement<parser::MpSubprogramStmt>>(x.t).statement.v,
+      std::get<parser::ExecutionPart>(x.t).v);
+}
+
+// !$CUF KERNEL DO semantic checks
+
+static int DoConstructTightNesting(
+    const parser::DoConstruct *doConstruct, const parser::Block *&innerBlock) {
+  if (!doConstruct || !doConstruct->IsDoNormal()) {
+    return 0;
+  }
+  innerBlock = &std::get<parser::Block>(doConstruct->t);
+  if (innerBlock->size() == 1) {
+    if (const auto *execConstruct{
+            std::get_if<parser::ExecutableConstruct>(&innerBlock->front().u)}) {
+      if (const auto *next{
+              std::get_if<common::Indirection<parser::DoConstruct>>(
+                  &execConstruct->u)}) {
+        return 1 + DoConstructTightNesting(&next->value(), innerBlock);
+      }
+    }
+  }
+  return 1;
+}
+
+void CUDAChecker::Enter(const parser::CUFKernelDoConstruct &x) {
+  auto source{std::get<parser::CUFKernelDoConstruct::Directive>(x.t).source};
+  const auto &directive{std::get<parser::CUFKernelDoConstruct::Directive>(x.t)};
+  std::int64_t depth{1};
+  if (auto expr{AnalyzeExpr(context_,
+          std::get<std::optional<parser::ScalarIntConstantExpr>>(
+              directive.t))}) {
+    depth = evaluate::ToInt64(expr).value_or(0);
+    if (depth <= 0) {
+      context_.Say(source,
+          "!$CUF KERNEL DO (%jd): loop nesting depth must be positive"_err_en_US,
+          std::intmax_t{depth});
+      depth = 1;
+    }
+  }
+  const parser::DoConstruct *doConstruct{common::GetPtrFromOptional(
+      std::get<std::optional<parser::DoConstruct>>(x.t))};
+  const parser::Block *innerBlock{nullptr};
+  if (DoConstructTightNesting(doConstruct, innerBlock) < depth) {
+    context_.Say(source,
+        "!$CUF KERNEL DO (%jd) must be followed by a DO construct with tightly nested outer levels of counted DO loops"_err_en_US,
+        std::intmax_t{depth});
+  }
+  if (innerBlock) {
+    DeviceContextChecker<true>{context_}.Check(*innerBlock);
+  }
+}
+
+} // namespace Fortran::semantics
diff --git a/flang/lib/Semantics/check-deallocate.cpp b/flang/lib/Semantics/check-deallocate.cpp
--- a/flang/lib/Semantics/check-deallocate.cpp
+++ b/flang/lib/Semantics/check-deallocate.cpp
@@ -19,10 +19,13 @@
 void DeallocateChecker::Leave(const parser::DeallocateStmt &deallocateStmt) {
   for (const parser::AllocateObject &allocateObject :
       std::get<std::list<parser::AllocateObject>>(deallocateStmt.t)) {
+    parser::CharBlock source;
+    const Symbol *symbol{nullptr};
     common::visit(
         common::visitors{
             [&](const parser::Name &name) {
-              auto const *symbol{name.symbol};
+              source = name.source;
+              symbol = name.symbol;
               if (context_.HasError(symbol)) {
                 // already reported an error
               } else if (!IsVariableName(*symbol)) {
@@ -58,9 +61,10 @@
             [&](const parser::StructureComponent &structureComponent) {
               // Only perform structureComponent checks if it was successfully
               // analyzed by expression analysis.
+              source = structureComponent.component.source;
+              symbol = structureComponent.component.symbol;
               if (const auto *expr{GetExpr(context_, allocateObject)}) {
-                if (const Symbol *symbol{structureComponent.component.symbol}) {
-                  auto source{structureComponent.component.source};
+                if (symbol) {
                   if (!IsAllocatableOrPointer(*symbol)) { // C932
                     context_.Say(source,
                         "Component in DEALLOCATE statement must have the ALLOCATABLE or POINTER attribute"_err_en_US);
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -2740,7 +2740,7 @@
         Say(source,
             "'%s' already has another CUDA data attribute ('%s')"_err_en_US,
             symbol.name(),
-            common::EnumToString(*object->cudaDataAttr()).substr());
+            std::string{common::EnumToString(*object->cudaDataAttr())}.c_str());
       } else {
         object->set_cudaDataAttr(attr);
       }
@@ -7700,13 +7700,11 @@
 
 void ResolveNamesVisitor::UseCUDABuiltinNames() {
   if (FindCUDADeviceContext(&currScope())) {
-    if (const Scope * CUDABuiltins{context().GetCUDABuiltinsScope()}) {
-      for (const auto &[name, symbol] : *CUDABuiltins) {
-        if (!FindInScope(name)) {
-          auto &localSymbol{MakeSymbol(name)};
-          localSymbol.set_details(UseDetails{name, *symbol});
-          localSymbol.flags() = symbol->flags();
-        }
+    for (const auto &[name, symbol] : context().GetCUDABuiltinsScope()) {
+      if (!FindInScope(name)) {
+        auto &localSymbol{MakeSymbol(name)};
+        localSymbol.set_details(UseDetails{name, *symbol});
+        localSymbol.flags() = symbol->flags();
       }
     }
   }
diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp
--- a/flang/lib/Semantics/semantics.cpp
+++ b/flang/lib/Semantics/semantics.cpp
@@ -16,6 +16,7 @@
 #include "check-arithmeticif.h"
 #include "check-case.h"
 #include "check-coarray.h"
+#include "check-cuda.h"
 #include "check-data.h"
 #include "check-deallocate.h"
 #include "check-declarations.h"
@@ -69,12 +70,13 @@
 // children are visited, Leave is called after. No two checkers may have the
 // same Enter or Leave function. Each checker must be constructible from
 // SemanticsContext and have BaseChecker as a virtual base class.
-template <typename... C> class SemanticsVisitor : public virtual C... {
+template <typename... C>
+class SemanticsVisitor : public virtual BaseChecker, public virtual C... {
 public:
-  using C::Enter...;
-  using C::Leave...;
   using BaseChecker::Enter;
   using BaseChecker::Leave;
+  using C::Enter...;
+  using C::Leave...;
   SemanticsVisitor(SemanticsContext &context)
       : C{context}..., context_{context} {}
 
@@ -158,12 +160,14 @@
 };
 
 using StatementSemanticsPass1 = ExprChecker;
-using StatementSemanticsPass2 = SemanticsVisitor<AccStructureChecker,
-    AllocateChecker, ArithmeticIfStmtChecker, AssignmentChecker, CaseChecker,
-    CoarrayChecker, DataChecker, DeallocateChecker, DoForallChecker,
-    IfStmtChecker, IoChecker, MiscChecker, NamelistChecker, NullifyChecker,
-    OmpStructureChecker, PurityChecker, ReturnStmtChecker,
-    SelectRankConstructChecker, SelectTypeChecker, StopChecker>;
+using StatementSemanticsPass2 = SemanticsVisitor<AllocateChecker,
+    ArithmeticIfStmtChecker, AssignmentChecker, CaseChecker, CoarrayChecker,
+    DataChecker, DeallocateChecker, DoForallChecker, IfStmtChecker, IoChecker,
+    MiscChecker, NamelistChecker, NullifyChecker, PurityChecker,
+    ReturnStmtChecker, SelectRankConstructChecker, SelectTypeChecker,
+    StopChecker>;
+using StatementSemanticsPass3 =
+    SemanticsVisitor<AccStructureChecker, OmpStructureChecker, CUDAChecker>;
 
 static bool PerformStatementSemantics(
     SemanticsContext &context, parser::Program &program) {
@@ -174,6 +178,11 @@
   StatementSemanticsPass1{context}.Walk(program);
   StatementSemanticsPass2 pass2{context};
   pass2.Walk(program);
+  if (context.languageFeatures().IsEnabled(common::LanguageFeature::OpenACC) ||
+      context.languageFeatures().IsEnabled(common::LanguageFeature::OpenMP) ||
+      context.languageFeatures().IsEnabled(common::LanguageFeature::CUDA)) {
+    StatementSemanticsPass3{context}.Walk(program);
+  }
   if (!context.AnyFatalError()) {
     pass2.CompileDataInitializationsIntoInitializers();
   }
@@ -476,11 +485,12 @@
   }
 }
 
-const Scope *SemanticsContext::GetCUDABuiltinsScope() {
-  if (!CUDABuiltinsScope_) {
-    CUDABuiltinsScope_ = GetBuiltinModule("__cuda_builtins");
+const Scope &SemanticsContext::GetCUDABuiltinsScope() {
+  if (!cudaBuiltinsScope_) {
+    cudaBuiltinsScope_ = GetBuiltinModule("__cuda_builtins");
+    CHECK(cudaBuiltinsScope_.value() != nullptr);
   }
-  return *CUDABuiltinsScope_;
+  return **cudaBuiltinsScope_;
 }
 
 void SemanticsContext::UsePPCFortranBuiltinsModule() {
@@ -525,6 +535,7 @@
       parser::CanonicalizeDo(program_) && // force line break
       CanonicalizeAcc(context_.messages(), program_) &&
       CanonicalizeOmp(context_.messages(), program_) &&
+      CanonicalizeCUDA(program_) &&
       PerformStatementSemantics(context_, program_) &&
       ModFileWriter{context_}.WriteAll();
 }
@@ -566,7 +577,7 @@
   if (scope.derivedTypeSpec()) {
     os << " instantiation of " << *scope.derivedTypeSpec();
   }
-  os << '\n';
+  os << " sourceRange=" << scope.sourceRange().size() << " bytes\n";
   ++indent;
   for (const auto &pair : scope) {
     const auto &symbol{*pair.second};
diff --git a/flang/module/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
--- a/flang/module/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -75,4 +75,23 @@
 
   intrinsic :: __builtin_compiler_options, __builtin_compiler_version
 
+  interface operator(==)
+    module procedure __builtin_c_ptr_eq
+  end interface
+  interface operator(/=)
+    module procedure __builtin_c_ptr_eq
+  end interface
+
+contains
+
+  elemental logical function __builtin_c_ptr_eq(x, y)
+    type(__builtin_c_ptr), intent(in) :: x, y
+    __builtin_c_ptr_eq = x%__address == y%__address
+  end function
+
+  elemental logical function __builtin_c_ptr_ne(x, y)
+    type(__builtin_c_ptr), intent(in) :: x, y
+    __builtin_c_ptr_ne = x%__address /= y%__address
+  end function
+
 end module
diff --git a/flang/module/iso_c_binding.f90 b/flang/module/iso_c_binding.f90
--- a/flang/module/iso_c_binding.f90
+++ b/flang/module/iso_c_binding.f90
@@ -15,7 +15,8 @@
     c_ptr => __builtin_c_ptr, &
     c_funptr => __builtin_c_funptr, &
     c_sizeof => sizeof, &
-    c_loc => __builtin_c_loc
+    c_loc => __builtin_c_loc, &
+    operator(==), operator(/=)
 
   type(c_ptr), parameter :: c_null_ptr = c_ptr(0)
   type(c_funptr), parameter :: c_null_funptr = c_funptr(0)
diff --git a/flang/test/Driver/compiler_options.f90 b/flang/test/Driver/compiler_options.f90
--- a/flang/test/Driver/compiler_options.f90
+++ b/flang/test/Driver/compiler_options.f90
@@ -1,6 +1,6 @@
 ! RUN: %flang -S -emit-llvm -o - %s | FileCheck %s
 ! Test communication of COMPILER_OPTIONS from flang-new to flang-new -fc1.
-! CHECK: [[OPTSVAR:@_QQcl\.[0-9a-f]+]] = internal constant [[[OPTSLEN:[0-9]+]] x i8] c"{{.*}}flang-new{{(\.exe)?}} -S -emit-llvm -o - {{.*}}compiler_options.f90"
+! CHECK: [[OPTSVAR:@_QQcl\.[0-9a-f]+]] = {{[a-z]+}} constant [[[OPTSLEN:[0-9]+]] x i8] c"{{.*}}flang-new{{(\.exe)?}} -S -emit-llvm -o - {{.*}}compiler_options.f90"
 program main
     use ISO_FORTRAN_ENV, only: compiler_options
     implicit none
diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF
--- a/flang/test/Parser/cuf-sanity-unparse.CUF
+++ b/flang/test/Parser/cuf-sanity-unparse.CUF
@@ -34,9 +34,9 @@
 !CHECK:   !$CUF KERNEL DO <<<1_4,(2_4,3_4),STREAM=1_4>>>
 !CHECK:   DO j=1_4,10_4
 !CHECK:   END DO
-!CHECK:    CALL globalsub<<<1_4,2_4>>>
-!CHECK:    CALL globalsub<<<1_4,2_4,3_4>>>
-!CHECK:    CALL globalsub<<<1_4,2_4,3_4,4_4>>>
+!CHECK:    CALL globalsub<<<1_4,2_4>>>()
+!CHECK:    CALL globalsub<<<1_4,2_4,3_4>>>()
+!CHECK:    CALL globalsub<<<1_4,2_4,3_4,4_4>>>()
 !CHECK:   ALLOCATE(pa(32_4), STREAM=1_4, PINNED=ispinned)
 !CHECK:  END SUBROUTINE
 !CHECK: END MODULE
diff --git a/flang/test/Semantics/cuf04.cuf b/flang/test/Semantics/cuf04.cuf
new file mode 100644
--- /dev/null
+++ b/flang/test/Semantics/cuf04.cuf
@@ -0,0 +1,24 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! CUDA Fortran section 2.5.6 restrictions
+module m
+ contains
+  attributes(device) subroutine devsubr(n)
+    integer, intent(in) :: n
+    !WARNING: 'x1' should not have the SAVE attribute or initialization in a device subprogram
+    real, save :: x1
+    !WARNING: 'x2' should not have the SAVE attribute or initialization in a device subprogram
+    real :: x2 = 1.
+    !ERROR: Device subprogram 'devsubr' cannot call itself
+    if (n > 0) call devsubr(n-1)
+  end subroutine
+  attributes(global) subroutine globsubr
+  end subroutine
+  subroutine boring
+  end subroutine
+  subroutine test
+    !ERROR: 'globsubr' is a kernel subroutine and must be called with kernel launch parameters in chevrons
+    call globsubr
+    !ERROR: Kernel launch parameters in chevrons may not be used unless calling a kernel subroutine
+    call boring<<<1,2>>>
+  end subroutine
+end module
diff --git a/flang/test/Semantics/cuf05.cuf b/flang/test/Semantics/cuf05.cuf
new file mode 100644
--- /dev/null
+++ b/flang/test/Semantics/cuf05.cuf
@@ -0,0 +1,19 @@
+! RUN: %flang_fc1 -fdebug-dump-symbols %s 2>&1 | FileCheck --dump-input-context=500 %s
+!CHECK: Global scope: size=0 alignment=1 sourceRange=0 bytes
+!CHECK: IntrinsicModules scope: size=0 alignment=1 sourceRange=0 bytes
+!CHECK: Module scope: __fortran_builtins
+!CHECK: Module scope: __cuda_builtins size=0 alignment=1
+!CHECK: Module scope: __fortran_type_info
+!CHECK: Module scope: m size=0 alignment=1
+!CHECK: Subprogram scope: devsubr size=0 alignment=1
+module m
+  implicit none
+ contains
+  attributes(device) subroutine devsubr()
+  !CHECK: blockdim: Use from blockdim in __cuda_builtins
+  !CHECK: blockidx: Use from blockidx in __cuda_builtins
+  !CHECK: griddim: Use from griddim in __cuda_builtins
+  !CHECK: threadidx: Use from threadidx in __cuda_builtins
+  !CHECK: warpsize: Use from warpsize in __cuda_builtins
+  end subroutine
+end module
diff --git a/flang/test/Semantics/cuf06.cuf b/flang/test/Semantics/cuf06.cuf
new file mode 100644
--- /dev/null
+++ b/flang/test/Semantics/cuf06.cuf
@@ -0,0 +1,15 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+module m
+  use, intrinsic :: __fortran_builtins, only: __builtin_dim3
+ contains
+  attributes(global) subroutine kernel
+  end subroutine
+  subroutine test
+    call kernel<<< 1, 32 >>> ! ok
+    call kernel<<< __builtin_dim3(1,1), __builtin_dim3(32,1,1) >>> ! ok
+    !ERROR: Kernel launch grid parameter must be either integer or TYPE(dim3)
+    call kernel<<< 1.d0, 32 >>>
+    !ERROR: Kernel launch block parameter must be either integer or TYPE(dim3)
+    call kernel<<< 1, "abc" >>>
+  end
+end module
diff --git a/flang/test/Semantics/cuf09.cuf b/flang/test/Semantics/cuf09.cuf
new file mode 100644
--- /dev/null
+++ b/flang/test/Semantics/cuf09.cuf
@@ -0,0 +1,76 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+module m
+ contains
+  attributes(device) subroutine devsub
+    !ERROR: Statement may not appear in device code
+    !$cuf kernel do <<< 1, 2 >>>
+    do k=1,10
+    end do
+  end
+end
+
+program main
+  !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do <<< 1, 2 >>>
+  do while (.false.)
+  end do
+  !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do <<< 1, 2 >>>
+  do
+    exit
+  end do
+  !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do <<< 1, 2 >>>
+  do concurrent (j=1:10)
+  end do
+  !$cuf kernel do <<< 1, 2 >>>
+  do 1 j=1,10
+1 continue ! ok
+  !$cuf kernel do <<< 1, 2 >>>
+  do j=1,10
+  end do ! ok
+  !$cuf kernel do <<< 1, 2 >>>
+  do j=1,10
+    !ERROR: Statement may not appear in device code
+    !$cuf kernel do <<< 1, 2 >>>
+    do k=1,10
+    end do
+  end do
+  !ERROR: !$CUF KERNEL DO (-1): loop nesting depth must be positive
+  !$cuf kernel do (-1) <<< 1, 2 >>>
+  do j=1,10
+  end do
+  !ERROR: !$CUF KERNEL DO (1) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do <<< 1, 2 >>>
+  continue
+  !ERROR: !$CUF KERNEL DO (2) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do (2) <<< 1, 2 >>>
+  do j=1,10
+  end do
+  !ERROR: !$CUF KERNEL DO (2) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do (2) <<< 1, 2 >>>
+  do j=1,10
+    continue
+  end do
+  !ERROR: !$CUF KERNEL DO (2) must be followed by a DO construct with tightly nested outer levels of counted DO loops
+  !$cuf kernel do (2) <<< 1, 2 >>>
+  do j=1,10
+    do k=1,10
+    end do
+    continue
+  end do
+  !$cuf kernel do <<< 1, 2 >>>
+  do j = 1, 10
+    !ERROR: 'foo' may not be called in device code
+    call foo
+    !ERROR: 'bar' may not be called in device code
+    x = bar()
+    !ERROR: 'ifunc' may not be called in device code
+    if (ifunc() /= 0) continue
+    !ERROR: 'ifunc' may not be called in device code
+    if (ifunc() /= 0) then
+    !ERROR: 'ifunc' may not be called in device code
+    else if (ifunc() /= 1) then
+    end if
+  end do
+end
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
--- a/flang/tools/f18/CMakeLists.txt
+++ b/flang/tools/f18/CMakeLists.txt
@@ -10,6 +10,7 @@
   "__fortran_type_info"
   "__fortran_ppc_types"
   "__fortran_ppc_intrinsics"
+  "__cuda_builtins"
   "ieee_arithmetic"
   "ieee_exceptions"
   "ieee_features"
diff --git a/flang/unittests/Runtime/Time.cpp b/flang/unittests/Runtime/Time.cpp
--- a/flang/unittests/Runtime/Time.cpp
+++ b/flang/unittests/Runtime/Time.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef __clang__ // 16.0.3 lacks <charconv>
+
 #include "gtest/gtest.h"
 #include "flang/Runtime/time-intrinsic.h"
 #include <algorithm>
@@ -166,3 +168,4 @@
     EXPECT_LE(minutes, 59);
   }
 }
+#endif // __clang__