Index: flang/include/flang/Semantics/scope.h
===================================================================
--- flang/include/flang/Semantics/scope.h
+++ flang/include/flang/Semantics/scope.h
@@ -68,7 +68,7 @@
   explicit Scope(SemanticsContext &context)
       : Scope{*this, Kind::Global, nullptr, context} {}
   Scope(Scope &parent, Kind kind, Symbol *symbol, SemanticsContext &context)
-      : parent_{parent}, kind_{kind}, symbol_{symbol}, context_{context} {
+      : parent_{&parent}, kind_{kind}, symbol_{symbol}, context_{context} {
     if (symbol) {
       symbol->set_scope(this);
     }
@@ -79,12 +79,12 @@
   bool operator!=(const Scope &that) const { return this != &that; }
 
   Scope &parent() {
-    CHECK(&parent_ != this);
-    return parent_;
+    CHECK(parent_ != this);
+    return *parent_;
   }
   const Scope &parent() const {
-    CHECK(&parent_ != this);
-    return parent_;
+    CHECK(parent_ != this);
+    return *parent_;
   }
   Kind kind() const { return kind_; }
   bool IsGlobal() const { return kind_ == Kind::Global; }
@@ -121,6 +121,7 @@
   bool Contains(const Scope &) const;
   /// Make a scope nested in this one
   Scope &MakeScope(Kind kind, Symbol *symbol = nullptr);
+
   SemanticsContext &GetMutableSemanticsContext() const {
     return const_cast<SemanticsContext &>(context());
   }
@@ -271,7 +272,8 @@
   }
 
 private:
-  Scope &parent_; // this is enclosing scope, not extended derived type base
+  Scope *parent_{
+      nullptr}; // this is enclosing scope, not extended derived type base
   const Kind kind_;
   std::size_t size_{0}; // size in bytes
   std::optional<std::size_t> alignment_; // required alignment in bytes
Index: flang/include/flang/Semantics/semantics.h
===================================================================
--- flang/include/flang/Semantics/semantics.h
+++ flang/include/flang/Semantics/semantics.h
@@ -215,7 +215,7 @@
   // Defines builtinsScope_ from the __Fortran_builtins module
   void UseFortranBuiltinsModule();
   const Scope *GetBuiltinsScope() const { return builtinsScope_; }
-
+  const Scope *GetCUDABuiltinsScope();
   void UsePPCFortranBuiltinsModule();
   const Scope *GetPPCBuiltinsScope() const { return ppcBuiltinsScope_; }
 
@@ -253,7 +253,7 @@
   void CheckError(const Symbol &);
 
   const common::IntrinsicTypeDefaultKinds &defaultKinds_;
-  const common::LanguageFeatureControl languageFeatures_;
+  const common::LanguageFeatureControl &languageFeatures_;
   parser::AllCookedSources &allCookedSources_;
   std::optional<parser::CharBlock> location_;
   std::vector<std::string> searchDirectories_;
@@ -279,6 +279,7 @@
   UnorderedSymbolSet errorSymbols_;
   std::set<std::string> tempNames_;
   const Scope *builtinsScope_{nullptr}; // module __Fortran_builtins
+  std::optional<const Scope *> CUDABuiltinsScope_; // module __CUDA_builtins
   const Scope *ppcBuiltinsScope_{nullptr}; // module __Fortran_PPC_intrinsics
   std::list<parser::Program> modFileParseTrees_;
   std::unique_ptr<CommonBlockMap> commonBlockMap_;
Index: flang/include/flang/Semantics/symbol.h
===================================================================
--- flang/include/flang/Semantics/symbol.h
+++ flang/include/flang/Semantics/symbol.h
@@ -114,6 +114,26 @@
   }
   bool defaultIgnoreTKR() const { return defaultIgnoreTKR_; }
   void set_defaultIgnoreTKR(bool yes) { defaultIgnoreTKR_ = yes; }
+  std::optional<common::CUDASubprogramAttrs> cudaSubprogramAttrs() const {
+    return cudaSubprogramAttrs_;
+  }
+  void set_cudaSubprogramAttrs(common::CUDASubprogramAttrs csas) {
+    cudaSubprogramAttrs_ = csas;
+  }
+  std::vector<std::int64_t> &cudaLaunchBounds() { return cudaLaunchBounds_; }
+  const std::vector<std::int64_t> &cudaLaunchBounds() const {
+    return cudaLaunchBounds_;
+  }
+  void set_cudaLaunchBounds(std::vector<std::int64_t> &&x) {
+    cudaLaunchBounds_ = std::move(x);
+  }
+  std::vector<std::int64_t> &cudaClusterDims() { return cudaClusterDims_; }
+  const std::vector<std::int64_t> &cudaClusterDims() const {
+    return cudaClusterDims_;
+  }
+  void set_cudaClusterDims(std::vector<std::int64_t> &&x) {
+    cudaClusterDims_ = std::move(x);
+  }
 
 private:
   bool isInterface_{false}; // true if this represents an interface-body
@@ -127,6 +147,10 @@
   // appeared in an ancestor (sub)module.
   Symbol *moduleInterface_{nullptr};
   bool defaultIgnoreTKR_{false};
+  // CUDA ATTRIBUTES(...) from subroutine/function prefix
+  std::optional<common::CUDASubprogramAttrs> cudaSubprogramAttrs_;
+  // CUDA LANCH_BOUNDS(...) & CLUSTER_DIMS(...) from prefix
+  std::vector<std::int64_t> cudaLaunchBounds_, cudaClusterDims_;
 
   friend llvm::raw_ostream &operator<<(
       llvm::raw_ostream &, const SubprogramDetails &);
@@ -229,6 +253,12 @@
   bool CanBeDeferredShape() const { return shape_.CanBeDeferredShape(); }
   bool IsAssumedSize() const { return isDummy() && shape_.CanBeAssumedSize(); }
   bool IsAssumedRank() const { return isDummy() && shape_.IsAssumedRank(); }
+  std::optional<common::CUDADataAttr> cudaDataAttr() const {
+    return cudaDataAttr_;
+  }
+  void set_cudaDataAttr(std::optional<common::CUDADataAttr> attr) {
+    cudaDataAttr_ = attr;
+  }
 
 private:
   MaybeExpr init_;
@@ -237,6 +267,7 @@
   ArraySpec coshape_;
   common::IgnoreTKRSet ignoreTKR_;
   const Symbol *commonBlock_{nullptr}; // common block this object is in
+  std::optional<common::CUDADataAttr> cudaDataAttr_;
   friend llvm::raw_ostream &operator<<(
       llvm::raw_ostream &, const ObjectEntityDetails &);
 };
@@ -276,10 +307,13 @@
   std::optional<const Symbol *> init() const { return init_; }
   void set_init(const Symbol &symbol) { init_ = &symbol; }
   void set_init(std::nullptr_t) { init_ = nullptr; }
+  bool isCUDAKernel() const { return isCUDAKernel_; }
+  void set_isCUDAKernel(bool yes = true) { isCUDAKernel_ = yes; }
 
 private:
   const Symbol *procInterface_{nullptr};
   std::optional<const Symbol *> init_;
+  bool isCUDAKernel_{false};
   friend llvm::raw_ostream &operator<<(
       llvm::raw_ostream &, const ProcEntityDetails &);
 };
Index: flang/include/flang/Semantics/tools.h
===================================================================
--- flang/include/flang/Semantics/tools.h
+++ flang/include/flang/Semantics/tools.h
@@ -190,6 +190,22 @@
 bool IsUnlimitedPolymorphic(const Symbol &);
 bool IsPolymorphicAllocatable(const Symbol &);
 
+inline bool IsCUDADeviceContext(const Scope *scope) {
+  if (scope) {
+    if (const Symbol * symbol{scope->symbol()}) {
+      if (const auto *subp{symbol->detailsIf<SubprogramDetails>()}) {
+        if (auto attrs{subp->cudaSubprogramAttrs()}) {
+          return *attrs != common::CUDASubprogramAttrs::Host;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+const Scope *FindCUDADeviceContext(const Scope *);
+std::optional<common::CUDADataAttr> GetCUDADataAttr(const Symbol *);
+
 // Return an error if a symbol is not accessible from a scope
 std::optional<parser::MessageFormattedText> CheckAccessibleSymbol(
     const semantics::Scope &, const Symbol &);
@@ -386,9 +402,9 @@
 //     its non-POINTER derived type components.  (The lifetime of each
 //     potential subobject component is that of the entire instance.)
 //   - PotentialAndPointer subobject components of a derived type are the
-//   closure of
-//     its components (including POINTERs) and the PotentialAndPointer subobject
-//     components of its non-POINTER derived type components.
+//     closure of its components (including POINTERs) and the
+//     PotentialAndPointer subobject components of its non-POINTER derived type
+//     components.
 // Parent and procedure components are considered against these definitions.
 // For this kind of iterator, the component tree is recursively visited in the
 // following order:
Index: flang/lib/Semantics/mod-file.cpp
===================================================================
--- flang/lib/Semantics/mod-file.cpp
+++ flang/lib/Semantics/mod-file.cpp
@@ -457,6 +457,31 @@
     os << (isAbstract ? "abstract " : "") << "interface\n";
   }
   PutAttrs(os, prefixAttrs, nullptr, false, ""s, " "s);
+  if (auto attrs{details.cudaSubprogramAttrs()}) {
+    if (*attrs == common::CUDASubprogramAttrs::HostDevice) {
+      os << "attributes(host,device) ";
+    } else {
+      PutLower(os << "attributes(", common::EnumToString(*attrs)) << ") ";
+    }
+    if (!details.cudaLaunchBounds().empty()) {
+      os << "launch_bounds";
+      char sep{'('};
+      for (auto x : details.cudaLaunchBounds()) {
+        os << sep << x;
+        sep = ',';
+      }
+      os << ") ";
+    }
+    if (!details.cudaClusterDims().empty()) {
+      os << "cluster_dims";
+      char sep{'('};
+      for (auto x : details.cudaClusterDims()) {
+        os << sep << x;
+        sep = ',';
+      }
+      os << ") ";
+    }
+  }
   os << (details.isFunction() ? "function " : "subroutine ");
   os << symbol.name() << '(';
   int n = 0;
@@ -710,6 +735,10 @@
     });
     os << ") " << symbol.name() << '\n';
   }
+  if (auto attr{details.cudaDataAttr()}) {
+    PutLower(os << "attributes(", common::EnumToString(*attr))
+        << ") " << symbol.name() << '\n';
+  }
 }
 
 void ModFileWriter::PutProcEntity(llvm::raw_ostream &os, const Symbol &symbol) {
@@ -990,6 +1019,7 @@
   options.isModuleFile = true;
   options.features.Enable(common::LanguageFeature::BackslashEscapes);
   options.features.Enable(common::LanguageFeature::OpenMP);
+  options.features.Enable(common::LanguageFeature::CUDA);
   if (!isIntrinsic.value_or(false) && !notAModule) {
     // The search for this module file will scan non-intrinsic module
     // directories.  If a directory is in both the intrinsic and non-intrinsic
Index: flang/lib/Semantics/resolve-names.cpp
===================================================================
--- flang/lib/Semantics/resolve-names.cpp
+++ flang/lib/Semantics/resolve-names.cpp
@@ -238,6 +238,7 @@
 public:
   bool BeginAttrs(); // always returns true
   Attrs GetAttrs();
+  std::optional<common::CUDADataAttr> cudaDataAttr() { return cudaDataAttr_; }
   Attrs EndAttrs();
   bool SetPassNameOn(Symbol &);
   void SetBindNameOn(Symbol &);
@@ -278,9 +279,11 @@
   HANDLE_ATTR_CLASS(Value, VALUE)
   HANDLE_ATTR_CLASS(Volatile, VOLATILE)
 #undef HANDLE_ATTR_CLASS
+  bool Pre(const common::CUDADataAttr);
 
 protected:
   std::optional<Attrs> attrs_;
+  std::optional<common::CUDADataAttr> cudaDataAttr_;
 
   Attr AccessSpecToAttr(const parser::AccessSpec &x) {
     switch (x.v) {
@@ -419,7 +422,8 @@
 };
 
 // Track array specifications. They can occur in AttrSpec, EntityDecl,
-// ObjectDecl, DimensionStmt, CommonBlockObject, or BasedPointerStmt.
+// ObjectDecl, DimensionStmt, CommonBlockObject, BasedPointerStmt, and
+// ComponentDecl.
 // 1. INTEGER, DIMENSION(10) :: x
 // 2. INTEGER :: x(10)
 // 3. ALLOCATABLE :: x(:)
@@ -658,6 +662,8 @@
     symbol.attrs().set(attr);
     symbol.implicitAttrs().set(attr);
   }
+  void SetCUDADataAttr(
+      SourceName, Symbol &, std::optional<common::CUDADataAttr>);
 
 protected:
   FuncResultStack &funcResultStack() { return funcResultStack_; }
@@ -851,6 +857,9 @@
   void Post(const parser::InterfaceBody::Function &);
   bool Pre(const parser::Suffix &);
   bool Pre(const parser::PrefixSpec &);
+  bool Pre(const parser::PrefixSpec::Attributes &);
+  void Post(const parser::PrefixSpec::Launch_Bounds &);
+  void Post(const parser::PrefixSpec::Cluster_Dims &);
 
   bool BeginSubprogram(const parser::Name &, Symbol::Flag,
       bool hasModulePrefix = false,
@@ -922,6 +931,7 @@
     objectDeclAttr_ = Attr::TARGET;
     return true;
   }
+  bool Pre(const parser::CUDAAttributesStmt &);
   void Post(const parser::TargetStmt &) { objectDeclAttr_ = std::nullopt; }
   void Post(const parser::DimensionStmt::Declaration &);
   void Post(const parser::CodimensionDecl &);
@@ -1529,7 +1539,8 @@
     llvm_unreachable("This node is handled in ProgramUnit");
   }
 
-  void NoteExecutablePartCall(Symbol::Flag, const parser::Call &);
+  void NoteExecutablePartCall(
+      Symbol::Flag, const parser::Call &, bool hasCUDAChevrons);
 
   friend void ResolveSpecificationParts(SemanticsContext &, const Symbol &);
 
@@ -1557,6 +1568,7 @@
   void FinishSpecificationParts(const ProgramTree &);
   void FinishDerivedTypeInstantiation(Scope &);
   void ResolveExecutionParts(const ProgramTree &);
+  void UseCUDABuiltinNames();
 };
 
 // ImplicitRules implementation
@@ -1662,8 +1674,8 @@
 // AttrsVisitor implementation
 
 bool AttrsVisitor::BeginAttrs() {
-  CHECK(!attrs_);
-  attrs_ = std::make_optional<Attrs>();
+  CHECK(!attrs_ && !cudaDataAttr_);
+  attrs_ = Attrs{};
   return true;
 }
 Attrs AttrsVisitor::GetAttrs() {
@@ -1673,6 +1685,7 @@
 Attrs AttrsVisitor::EndAttrs() {
   Attrs result{GetAttrs()};
   attrs_.reset();
+  cudaDataAttr_.reset();
   passName_ = std::nullopt;
   bindName_.reset();
   return result;
@@ -1789,6 +1802,15 @@
   attrs_->set(attrName);
   return true;
 }
+bool AttrsVisitor::Pre(const common::CUDADataAttr x) {
+  if (cudaDataAttr_.value_or(x) != x) {
+    Say(currStmtSource().value(),
+        "CUDA data attributes '%s' and '%s' may not both be specified"_err_en_US,
+        common::EnumToString(*cudaDataAttr_), common::EnumToString(x));
+  }
+  cudaDataAttr_ = x;
+  return false;
+}
 
 // DeclTypeSpecVisitor implementation
 
@@ -2688,6 +2710,27 @@
   return ok;
 }
 
+void ScopeHandler::SetCUDADataAttr(SourceName source, Symbol &symbol,
+    std::optional<common::CUDADataAttr> attr) {
+  if (attr) {
+    ConvertToObjectEntity(symbol);
+    if (auto *object{symbol.detailsIf<ObjectEntityDetails>()}) {
+      if (*attr != object->cudaDataAttr().value_or(*attr)) {
+        Say(source,
+            "'%s' already has another CUDA data attribute ('%s')"_err_en_US,
+            symbol.name(),
+            common::EnumToString(*object->cudaDataAttr()).substr());
+      } else {
+        object->set_cudaDataAttr(attr);
+      }
+    } else {
+      Say(source,
+          "'%s' is not an object and may not have a CUDA data attribute"_err_en_US,
+          symbol.name());
+    }
+  }
+}
+
 // ModuleVisitor implementation
 
 bool ModuleVisitor::Pre(const parser::Only &x) {
@@ -3424,17 +3467,100 @@
     if (info.parsedType) { // C1543
       Say(currStmtSource().value(),
           "FUNCTION prefix cannot specify the type more than once"_err_en_US);
-      return false;
     } else {
       info.parsedType = parsedType;
       info.source = currStmtSource();
-      return false;
     }
+    return false;
   } else {
     return true;
   }
 }
 
+bool SubprogramVisitor::Pre(const parser::PrefixSpec::Attributes &attrs) {
+  if (auto *subp{currScope().symbol()
+              ? currScope().symbol()->detailsIf<SubprogramDetails>()
+              : nullptr}) {
+    for (auto attr : attrs.v) {
+      if (auto current{subp->cudaSubprogramAttrs()}) {
+        if (attr == *current ||
+            (*current == common::CUDASubprogramAttrs::HostDevice &&
+                (attr == common::CUDASubprogramAttrs::Host ||
+                    attr == common::CUDASubprogramAttrs::Device))) {
+          Say(currStmtSource().value(),
+              "ATTRIBUTES(%s) appears more than once"_warn_en_US,
+              common::EnumToString(attr));
+        } else if ((attr == common::CUDASubprogramAttrs::Host ||
+                       attr == common::CUDASubprogramAttrs::Device) &&
+            (*current == common::CUDASubprogramAttrs::Host ||
+                *current == common::CUDASubprogramAttrs::Device ||
+                *current == common::CUDASubprogramAttrs::HostDevice)) {
+          // HOST,DEVICE or DEVICE,HOST -> HostDevice
+          subp->set_cudaSubprogramAttrs(
+              common::CUDASubprogramAttrs::HostDevice);
+        } else {
+          Say(currStmtSource().value(),
+              "ATTRIBUTES(%s) conflicts with earlier ATTRIBUTES(%s)"_err_en_US,
+              common::EnumToString(attr), common::EnumToString(*current));
+        }
+      } else {
+        subp->set_cudaSubprogramAttrs(attr);
+      }
+    }
+  }
+  return false;
+}
+
+void SubprogramVisitor::Post(const parser::PrefixSpec::Launch_Bounds &x) {
+  std::vector<std::int64_t> bounds;
+  bool ok{true};
+  for (const auto &sicx : x.v) {
+    if (auto value{evaluate::ToInt64(EvaluateExpr(sicx))}) {
+      bounds.push_back(*value);
+    } else {
+      ok = false;
+    }
+  }
+  if (!ok || bounds.size() != 2) {
+    Say(currStmtSource().value(),
+        "Operands of LAUNCH_BOUNDS() must be two integer constants"_err_en_US);
+  } else if (auto *subp{currScope().symbol()
+                     ? currScope().symbol()->detailsIf<SubprogramDetails>()
+                     : nullptr}) {
+    if (subp->cudaLaunchBounds().empty()) {
+      subp->set_cudaLaunchBounds(std::move(bounds));
+    } else {
+      Say(currStmtSource().value(),
+          "LAUNCH_BOUNDS() may only appear once"_err_en_US);
+    }
+  }
+}
+
+void SubprogramVisitor::Post(const parser::PrefixSpec::Cluster_Dims &x) {
+  std::vector<std::int64_t> dims;
+  bool ok{true};
+  for (const auto &sicx : x.v) {
+    if (auto value{evaluate::ToInt64(EvaluateExpr(sicx))}) {
+      dims.push_back(*value);
+    } else {
+      ok = false;
+    }
+  }
+  if (!ok || dims.size() != 3) {
+    Say(currStmtSource().value(),
+        "Operands of CLUSTER_DIMS() must be three integer constants"_err_en_US);
+  } else if (auto *subp{currScope().symbol()
+                     ? currScope().symbol()->detailsIf<SubprogramDetails>()
+                     : nullptr}) {
+    if (subp->cudaClusterDims().empty()) {
+      subp->set_cudaClusterDims(std::move(dims));
+    } else {
+      Say(currStmtSource().value(),
+          "CLUSTER_DIMS() may only appear once"_err_en_US);
+    }
+  }
+}
+
 bool SubprogramVisitor::Pre(const parser::InterfaceBody::Subroutine &x) {
   const auto &name{std::get<parser::Name>(
       std::get<parser::Statement<parser::SubroutineStmt>>(x.t).statement.t)};
@@ -3766,6 +3892,7 @@
   }
   SubprogramDetails &entryDetails{entrySymbol.get<SubprogramDetails>()};
   CHECK(entryDetails.entryScope() == &inclusiveScope);
+  SetCUDADataAttr(name.source, entrySymbol, cudaDataAttr());
   entrySymbol.attrs() |= GetAttrs();
   SetBindNameOn(entrySymbol);
   for (const auto &dummyArg : std::get<std::list<parser::DummyArg>>(stmt.t)) {
@@ -4150,6 +4277,7 @@
   Attrs attrs{attrs_ ? HandleSaveName(name.source, *attrs_) : Attrs{}};
   Symbol &symbol{DeclareUnknownEntity(name, attrs)};
   symbol.ReplaceName(name.source);
+  SetCUDADataAttr(name.source, symbol, cudaDataAttr());
   if (const auto &init{std::get<std::optional<parser::Initialization>>(x.t)}) {
     ConvertToObjectEntity(symbol) || ConvertToProcEntity(symbol);
     symbol.set(
@@ -4422,6 +4550,23 @@
 bool DeclarationVisitor::Pre(const parser::VolatileStmt &x) {
   return HandleAttributeStmt(Attr::VOLATILE, x.v);
 }
+bool DeclarationVisitor::Pre(const parser::CUDAAttributesStmt &x) {
+  auto attr{std::get<common::CUDADataAttr>(x.t)};
+  for (const auto &name : std::get<std::list<parser::Name>>(x.t)) {
+    auto *symbol{FindInScope(name)};
+    if (symbol && symbol->has<UseDetails>()) {
+      Say(currStmtSource().value(),
+          "Cannot apply CUDA data attribute to use-associated '%s'"_err_en_US,
+          name.source);
+    } else {
+      if (!symbol) {
+        symbol = &MakeSymbol(name, ObjectEntityDetails{});
+      }
+      SetCUDADataAttr(name.source, *symbol, attr);
+    }
+  }
+  return false;
+}
 // Handle a statement that sets an attribute on a list of names.
 bool DeclarationVisitor::HandleAttributeStmt(
     Attr attr, const std::list<parser::Name> &names) {
@@ -4972,6 +5117,7 @@
   }
   if (OkToAddComponent(name)) {
     auto &symbol{DeclareObjectEntity(name, attrs)};
+    SetCUDADataAttr(name.source, symbol, cudaDataAttr());
     if (symbol.has<ObjectEntityDetails>()) {
       if (auto &init{std::get<std::optional<parser::Initialization>>(x.t)}) {
         Initialization(name, *init, true);
@@ -5079,6 +5225,7 @@
     attrs.set(Attr::EXTERNAL);
   }
   Symbol &symbol{DeclareProcEntity(name, attrs, procInterface)};
+  SetCUDADataAttr(name.source, symbol, cudaDataAttr()); // for error
   symbol.ReplaceName(name.source);
   if (dtDetails) {
     dtDetails->add_component(symbol);
@@ -6057,6 +6204,7 @@
       attrs.set(Attr::PRIVATE);
     }
     Symbol &result{MakeSymbol(name, attrs, std::move(details))};
+    SetCUDADataAttr(name, result, cudaDataAttr());
     if (result.has<TypeParamDetails>()) {
       derivedType.symbol()->get<DerivedTypeDetails>().add_paramDecl(result);
     }
@@ -6718,6 +6866,7 @@
 }
 bool ResolveNamesVisitor::Pre(const parser::CallStmt &x) {
   HandleCall(Symbol::Flag::Subroutine, x.call);
+  Walk(x.chevrons);
   return false;
 }
 
@@ -7233,7 +7382,7 @@
 // of the subprogram's interface, and to mark as procedures any symbols
 // that might otherwise have been miscategorized as objects.
 void ResolveNamesVisitor::NoteExecutablePartCall(
-    Symbol::Flag flag, const parser::Call &call) {
+    Symbol::Flag flag, const parser::Call &call, bool hasCUDAChevrons) {
   auto &designator{std::get<parser::ProcedureDesignator>(call.t)};
   if (const auto *name{std::get_if<parser::Name>(&designator.u)}) {
     // Subtlety: The symbol pointers in the parse tree are not set, because
@@ -7245,12 +7394,15 @@
               : Symbol::Flag::Subroutine};
       if (!symbol->test(other)) {
         ConvertToProcEntity(*symbol);
-        if (symbol->has<ProcEntityDetails>()) {
+        if (auto *details{symbol->detailsIf<ProcEntityDetails>()}) {
           symbol->set(flag);
           if (IsDummy(*symbol)) {
             SetImplicitAttr(*symbol, Attr::EXTERNAL);
           }
           ApplyImplicitRules(*symbol);
+          if (hasCUDAChevrons) {
+            details->set_isCUDAKernel();
+          }
         }
       }
     }
@@ -7378,6 +7530,7 @@
   Walk(ompDecls);
   Walk(compilerDirectives);
   Walk(useStmts);
+  UseCUDABuiltinNames();
   ClearUseRenames();
   ClearUseOnly();
   ClearModuleUses();
@@ -7394,6 +7547,20 @@
   return false;
 }
 
+void ResolveNamesVisitor::UseCUDABuiltinNames() {
+  if (FindCUDADeviceContext(&currScope())) {
+    if (const Scope * CUDABuiltins{context().GetCUDABuiltinsScope()}) {
+      for (const auto &[name, symbol] : *CUDABuiltins) {
+        if (!FindInScope(name)) {
+          auto &localSymbol{MakeSymbol(name)};
+          localSymbol.set_details(UseDetails{name, *symbol});
+          localSymbol.flags() = symbol->flags();
+        }
+      }
+    }
+  }
+}
+
 // Initial processing on specification constructs, before visiting them.
 void ResolveNamesVisitor::PreSpecificationConstruct(
     const parser::SpecificationConstruct &spec) {
@@ -7931,10 +8098,11 @@
   template <typename A> bool Pre(const A &) { return true; }
   template <typename A> void Post(const A &) {}
   void Post(const parser::FunctionReference &fr) {
-    resolver_.NoteExecutablePartCall(Symbol::Flag::Function, fr.v);
+    resolver_.NoteExecutablePartCall(Symbol::Flag::Function, fr.v, false);
   }
   void Post(const parser::CallStmt &cs) {
-    resolver_.NoteExecutablePartCall(Symbol::Flag::Subroutine, cs.call);
+    resolver_.NoteExecutablePartCall(
+        Symbol::Flag::Subroutine, cs.call, cs.chevrons.has_value());
   }
 
 private:
@@ -8241,6 +8409,7 @@
 void ResolveNamesVisitor::Post(const parser::Program &) {
   // ensure that all temps were deallocated
   CHECK(!attrs_);
+  CHECK(!cudaDataAttr_);
   CHECK(!GetDeclTypeSpec());
 }
 
Index: flang/lib/Semantics/scope.cpp
===================================================================
--- flang/lib/Semantics/scope.cpp
+++ flang/lib/Semantics/scope.cpp
@@ -92,7 +92,7 @@
     const Scope *parent{symbol_->get<ModuleDetails>().parent()};
     return parent ? parent->FindSymbol(name) : nullptr;
   } else if (CanImport(name)) {
-    return parent_.FindSymbol(name);
+    return parent_->FindSymbol(name);
   } else {
     return nullptr;
   }
@@ -288,7 +288,7 @@
 
 // true if name can be imported or host-associated from parent scope.
 bool Scope::CanImport(const SourceName &name) const {
-  if (IsTopLevel() || parent_.IsTopLevel()) {
+  if (IsTopLevel() || parent_->IsTopLevel()) {
     return false;
   }
   switch (GetImportKind()) {
Index: flang/lib/Semantics/semantics.cpp
===================================================================
--- flang/lib/Semantics/semantics.cpp
+++ flang/lib/Semantics/semantics.cpp
@@ -470,6 +470,13 @@
   }
 }
 
+const Scope *SemanticsContext::GetCUDABuiltinsScope() {
+  if (!CUDABuiltinsScope_) {
+    CUDABuiltinsScope_ = GetBuiltinModule("__cuda_builtins");
+  }
+  return *CUDABuiltinsScope_;
+}
+
 void SemanticsContext::UsePPCFortranBuiltinsModule() {
   if (ppcBuiltinsScope_ == nullptr) {
     ppcBuiltinsScope_ = GetBuiltinModule("__fortran_ppc_intrinsics");
Index: flang/lib/Semantics/symbol.cpp
===================================================================
--- flang/lib/Semantics/symbol.cpp
+++ flang/lib/Semantics/symbol.cpp
@@ -128,6 +128,22 @@
   if (x.defaultIgnoreTKR_) {
     os << " defaultIgnoreTKR";
   }
+  if (x.cudaSubprogramAttrs_) {
+    os << " cudaSubprogramAttrs: "
+       << common::EnumToString(*x.cudaSubprogramAttrs_);
+  }
+  if (!x.cudaLaunchBounds_.empty()) {
+    os << " cudaLaunchBounds:";
+    for (auto x : x.cudaLaunchBounds_) {
+      os << ' ' << x;
+    }
+  }
+  if (!x.cudaClusterDims_.empty()) {
+    os << " cudaClusterDims:";
+    for (auto x : x.cudaClusterDims_) {
+      os << ' ' << x;
+    }
+  }
   return os;
 }
 
@@ -414,6 +430,9 @@
     os << ' ';
     x.ignoreTKR_.Dump(os, common::EnumToString);
   }
+  if (x.cudaDataAttr()) {
+    os << " cudaDataAttr: " << common::EnumToString(*x.cudaDataAttr());
+  }
   return os;
 }
 
@@ -443,6 +462,9 @@
       os << " => NULL()";
     }
   }
+  if (x.isCUDAKernel()) {
+    os << " isCUDAKernel";
+  }
   return os;
 }
 
Index: flang/lib/Semantics/tools.cpp
===================================================================
--- flang/lib/Semantics/tools.cpp
+++ flang/lib/Semantics/tools.cpp
@@ -1058,6 +1058,18 @@
   return IsAllocatable(symbol) && IsPolymorphic(symbol);
 }
 
+const Scope *FindCUDADeviceContext(const Scope *scope) {
+  return !scope ? nullptr : FindScopeContaining(*scope, [](const Scope &s) {
+    return IsCUDADeviceContext(&s);
+  });
+}
+
+std::optional<common::CUDADataAttr> GetCUDADataAttr(const Symbol *symbol) {
+  const auto *object{
+      symbol ? symbol->detailsIf<ObjectEntityDetails>() : nullptr};
+  return object ? object->cudaDataAttr() : std::nullopt;
+}
+
 std::optional<parser::MessageFormattedText> CheckAccessibleSymbol(
     const Scope &scope, const Symbol &symbol) {
   if (symbol.attrs().test(Attr::PRIVATE)) {
Index: flang/module/__fortran_builtins.f90
===================================================================
--- flang/module/__fortran_builtins.f90
+++ flang/module/__fortran_builtins.f90
@@ -43,6 +43,15 @@
   integer, parameter :: __builtin_atomic_int_kind = selected_int_kind(18)
   integer, parameter :: __builtin_atomic_logical_kind = __builtin_atomic_int_kind
 
+  procedure(type(__builtin_c_ptr)) :: __builtin_c_loc
+
+  type :: __builtin_dim3
+    integer :: x=1, y=1, z=1
+  end type
+  type(__builtin_dim3) :: &
+    __builtin_threadIdx, __builtin_blockDim, __builtin_blockIdx, __builtin_gridDim
+  integer, parameter :: __builtin_warpsize = 32
+
   intrinsic :: __builtin_ieee_is_nan, __builtin_ieee_is_negative, &
     __builtin_ieee_is_normal
   intrinsic :: __builtin_ieee_next_after, __builtin_ieee_next_down, &
Index: flang/test/Parser/cuf-sanity-common
===================================================================
--- flang/test/Parser/cuf-sanity-common
+++ flang/test/Parser/cuf-sanity-common
@@ -18,7 +18,7 @@
   attributes(global) launch_bounds(1, 2) subroutine lbsub; end
   attributes(global) cluster_dims(1, 2, 3) subroutine cdsub; end
   attributes(device) subroutine attrs
-! enable with name resolution:    attributes(device) :: devx1
+    attributes(device) :: devx1
     real, device :: devx2
   end subroutine
   subroutine test
Index: flang/test/Semantics/cuf01.cuf
===================================================================
--- /dev/null
+++ flang/test/Semantics/cuf01.cuf
@@ -0,0 +1,22 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! Test conflicting CUDA subprogram attributes
+module m1
+ contains
+  !WARNING: ATTRIBUTES(Host) appears more than once
+  attributes(host,host) subroutine ok1; end
+  !WARNING: ATTRIBUTES(Host) appears more than once
+  attributes(host) attributes(host) subroutine ok2; end
+  attributes(host,device) subroutine ok3; end
+  attributes(device,host) subroutine ok4; end
+  !WARNING: ATTRIBUTES(Host) appears more than once
+  attributes(host,device,host) subroutine ok5; end
+  !WARNING: ATTRIBUTES(Device) appears more than once
+  attributes(device,host,device) subroutine ok6; end
+  !ERROR: ATTRIBUTES(Global) conflicts with earlier ATTRIBUTES(Host)
+  attributes(host,global) subroutine conflict1; end
+  !ERROR: ATTRIBUTES(Host) conflicts with earlier ATTRIBUTES(Global)
+  attributes(global,host) subroutine conflict2; end
+  !ERROR: ATTRIBUTES(Grid_Global) conflicts with earlier ATTRIBUTES(Host)
+  attributes(host,grid_global) subroutine conflict3; end
+  !TODO: more with launch_bounds & cluster_dims
+end module
Index: flang/test/Semantics/modfile55.cuf
===================================================================
--- /dev/null
+++ flang/test/Semantics/modfile55.cuf
@@ -0,0 +1,41 @@
+! RUN: %python %S/test_modfile.py %s %flang_fc1
+! Sanity check for CUDA Fortran attributes in module files
+module m
+  attributes(device) dd
+  real, managed, allocatable :: md
+  real, pinned, allocatable :: mp
+  attributes(constant) cd
+ contains
+  attributes(global) subroutine globsub(x,y,z)
+    real, value :: x
+    real, device :: y
+    real, managed :: z
+  end subroutine
+  attributes(host,device) real function foo(x)
+    foo = x + 1.
+  end function
+end
+
+!Expect: m.mod
+!module m
+!real(4)::dd
+!attributes(device) dd
+!real(4),allocatable::md
+!attributes(managed) md
+!real(4),allocatable::mp
+!attributes(pinned) mp
+!real(4)::cd
+!attributes(constant) cd
+!contains
+!attributes(global) subroutine globsub(x,y,z)
+!real(4),value::x
+!real(4)::y
+!attributes(device) y
+!real(4)::z
+!attributes(managed) z
+!end
+!attributes(host,device) function foo(x)
+!real(4)::x
+!real(4)::foo
+!end
+!end