diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2153,6 +2153,11 @@
   /// than implicitly __strong.
   bool isObjCARCImplicitlyUnretainedType() const;
 
+  /// Check if the type is the CUDA device builtin surface type.
+  bool isCUDADeviceBuiltinSurfaceType() const;
+  /// Check if the type is the CUDA device builtin texture type.
+  bool isCUDADeviceBuiltinTextureType() const;
+
   /// Return the implicit lifetime for this type, which must not be dependent.
   Qualifiers::ObjCLifetime getObjCARCImplicitLifetime() const;
 
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1044,16 +1044,20 @@
   let LangOpts = [CUDA];
 }
 
-def CUDADeviceBuiltinSurfaceType : IgnoredAttr {
+def CUDADeviceBuiltinSurfaceType : InheritableAttr {
   let Spellings = [GNU<"device_builtin_surface_type">,
                    Declspec<"__device_builtin_surface_type__">];
   let LangOpts = [CUDA];
+  let Subjects = SubjectList<[Type]>;
+  let Documentation = [CUDADeviceBuiltinSurfaceTypeDocs];
 }
 
-def CUDADeviceBuiltinTextureType : IgnoredAttr {
+def CUDADeviceBuiltinTextureType : InheritableAttr {
   let Spellings = [GNU<"device_builtin_texture_type">,
                    Declspec<"__device_builtin_texture_type__">];
   let LangOpts = [CUDA];
+  let Subjects = SubjectList<[Type]>;
+  let Documentation = [CUDADeviceBuiltinTextureTypeDocs];
 }
 
 def CUDAGlobal : InheritableAttr {
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -4624,6 +4624,28 @@
   }];
 }
 
+def CUDADeviceBuiltinSurfaceTypeDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``device_builtin_surface_type`` attribute can be applied to a class
+template when declaring the surface reference. A surface reference variable
+could be accessed on the host side and, on the device side, might be translated
+into an internal surface object, which is established through surface bind and
+unbind runtime APIs.
+  }];
+}
+
+def CUDADeviceBuiltinTextureTypeDocs : Documentation {
+  let Category = DocCatType;
+  let Content = [{
+The ``device_builtin_texture_type`` attribute can be applied to a class
+template when declaring the texture reference. A texture reference variable
+could be accessed on the host side and, on the device side, might be translated
+into an internal texture object, which is established through texture bind and
+unbind runtime APIs.
+  }];
+}
+
 def LifetimeOwnerDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -4116,6 +4116,20 @@
   return Pointee->isVoidType() || Pointee->isRecordType();
 }
 
+/// Check if the specified type is the CUDA device builtin surface type.
+bool Type::isCUDADeviceBuiltinSurfaceType() const {
+  if (const auto *RT = getAs<RecordType>())
+    return RT->getDecl()->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>();
+  return false;
+}
+
+/// Check if the specified type is the CUDA device builtin texture type.
+bool Type::isCUDADeviceBuiltinTextureType() const {
+  if (const auto *RT = getAs<RecordType>())
+    return RT->getDecl()->hasAttr<CUDADeviceBuiltinTextureTypeAttr>();
+  return false;
+}
+
 bool Type::hasSizedVLAType() const {
   if (!isVariablyModifiedType()) return false;
 
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -50,7 +50,7 @@
   struct VarInfo {
     llvm::GlobalVariable *Var;
     const VarDecl *D;
-    unsigned Flag;
+    DeviceVarFlags Flags;
   };
   llvm::SmallVector<VarInfo, 16> DeviceVars;
   /// Keeps track of variable containing handle of GPU binary. Populated by
@@ -124,8 +124,25 @@
 
   void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
   void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
-                         unsigned Flags) override {
-    DeviceVars.push_back({&Var, VD, Flags});
+                         bool Extern, bool Constant) override {
+    DeviceVars.push_back({&Var,
+                          VD,
+                          {DeviceVarFlags::Variable, Extern, Constant,
+                           /*Normalized*/ false, /*Type*/ 0}});
+  }
+  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
+                          bool Extern, int Type) override {
+    DeviceVars.push_back({&Var,
+                          VD,
+                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
+                           /*Normalized*/ false, Type}});
+  }
+  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
+                         bool Extern, int Type, bool Normalized) override {
+    DeviceVars.push_back({&Var,
+                          VD,
+                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
+                           Normalized, Type}});
   }
 
   /// Creates module constructor function
@@ -431,22 +448,55 @@
   llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(IntTy, RegisterVarParams, false),
       addUnderscoredPrefixToName("RegisterVar"));
+  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
+  //                            const void **, const char *, int, int);
+  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(
+          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
+          false),
+      addUnderscoredPrefixToName("RegisterSurface"));
+  // void __cudaRegisterTexture(void **, const struct textureReference *,
+  //                            const void **, const char *, int, int, int)
+  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(
+          VoidTy,
+          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
+          false),
+      addUnderscoredPrefixToName("RegisterTexture"));
   for (auto &&Info : DeviceVars) {
     llvm::GlobalVariable *Var = Info.Var;
-    unsigned Flags = Info.Flag;
     llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
-    uint64_t VarSize =
-        CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
-    llvm::Value *Args[] = {
-        &GpuBinaryHandlePtr,
-        Builder.CreateBitCast(Var, VoidPtrTy),
-        VarName,
-        VarName,
-        llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
-        llvm::ConstantInt::get(IntTy, VarSize),
-        llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
-        llvm::ConstantInt::get(IntTy, 0)};
-    Builder.CreateCall(RegisterVar, Args);
+    switch (Info.Flags.Kind) {
+    case DeviceVarFlags::Variable: {
+      uint64_t VarSize =
+          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
+      llvm::Value *Args[] = {&GpuBinaryHandlePtr,
+                             Builder.CreateBitCast(Var, VoidPtrTy),
+                             VarName,
+                             VarName,
+                             llvm::ConstantInt::get(IntTy, Info.Flags.Extern),
+                             llvm::ConstantInt::get(IntTy, VarSize),
+                             llvm::ConstantInt::get(IntTy, Info.Flags.Constant),
+                             llvm::ConstantInt::get(IntTy, 0)};
+      Builder.CreateCall(RegisterVar, Args);
+      break;
+    }
+    case DeviceVarFlags::Surface:
+      Builder.CreateCall(
+          RegisterSurf,
+          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
+           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.SurfTexType),
+           llvm::ConstantInt::get(IntTy, Info.Flags.Extern)});
+      break;
+    case DeviceVarFlags::Texture:
+      Builder.CreateCall(
+          RegisterTex,
+          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
+           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.SurfTexType),
+           llvm::ConstantInt::get(IntTy, Info.Flags.Normalized),
+           llvm::ConstantInt::get(IntTy, Info.Flags.Extern)});
+      break;
+    }
   }
 
   Builder.CreateRetVoid();
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -42,9 +42,17 @@
 
 public:
   // Global variable properties that must be passed to CUDA runtime.
-  enum DeviceVarFlags {
-    ExternDeviceVar = 0x01,   // extern
-    ConstantDeviceVar = 0x02, // __constant__
+  struct DeviceVarFlags {
+    enum DeviceVarKind {
+      Variable, // Variable
+      Surface,  // Builtin surface
+      Texture,  // Builtin texture
+    };
+    unsigned Kind : 2;
+    unsigned Extern : 1;
+    unsigned Constant : 2;   // Constant variable.
+    unsigned Normalized : 1; // Normalized texture.
+    int SurfTexType;         // Type of surface/texutre.
   };
 
   CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
@@ -57,7 +65,11 @@
   /// Emits a kernel launch stub.
   virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
   virtual void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
-                                 unsigned Flags) = 0;
+                                 bool Extern, bool Constant) = 0;
+  virtual void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
+                                  bool Extern, int Type) = 0;
+  virtual void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
+                                 bool Extern, int Type, bool Normalized) = 0;
 
   /// Constructs and returns a module initialization function or nullptr if it's
   /// not needed. Must be called after all kernels have been emitted.
diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -15,6 +15,7 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
+#include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclCXX.h"
@@ -1937,6 +1938,18 @@
     }
   }
 
+  if (getLangOpts().CUDAIsDevice) {
+    if (Ty->isCUDADeviceBuiltinSurfaceType()) {
+      if (getTargetHooks().emitCUDADeviceBuiltinSurfaceDeviceCopy(*this, Dest,
+                                                                  Src))
+        return;
+    } else if (Ty->isCUDADeviceBuiltinTextureType()) {
+      if (getTargetHooks().emitCUDADeviceBuiltinTextureDeviceCopy(*this, Dest,
+                                                                  Src))
+        return;
+    }
+  }
+
   // Aggregate assignment turns into llvm.memcpy.  This is almost valid per
   // C99 6.5.16.1p3, which states "If the value being stored in an object is
   // read from another object that overlaps in anyway the storage of the first
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -698,6 +698,19 @@
 TBAAAccessInfo CodeGenModule::getTBAAAccessInfo(QualType AccessType) {
   if (!TBAA)
     return TBAAAccessInfo();
+  if (getLangOpts().CUDAIsDevice) {
+    // As CUDA builtin surface/texture types are replaced, skip generating TBAA
+    // access info.
+    if (AccessType->isCUDADeviceBuiltinSurfaceType()) {
+      if (getTargetCodeGenInfo().getCUDADeviceBuiltinSurfaceDeviceType() !=
+          nullptr)
+        return TBAAAccessInfo();
+    } else if (AccessType->isCUDADeviceBuiltinTextureType()) {
+      if (getTargetCodeGenInfo().getCUDADeviceBuiltinTextureDeviceType() !=
+          nullptr)
+        return TBAAAccessInfo();
+    }
+  }
   return TBAA->getAccessInfo(AccessType);
 }
 
@@ -2492,7 +2505,9 @@
           !Global->hasAttr<CUDAGlobalAttr>() &&
           !Global->hasAttr<CUDAConstantAttr>() &&
           !Global->hasAttr<CUDASharedAttr>() &&
-          !(LangOpts.HIP && Global->hasAttr<HIPPinnedShadowAttr>()))
+          !(LangOpts.HIP && Global->hasAttr<HIPPinnedShadowAttr>()) &&
+          !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
+          !Global->getType()->isCUDADeviceBuiltinTextureType())
         return;
     } else {
       // We need to emit host-side 'shadows' for all global
@@ -3945,12 +3960,16 @@
       !getLangOpts().CUDAIsDevice &&
       (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>() ||
        D->hasAttr<CUDASharedAttr>());
+  bool IsCUDADeviceShadowVar =
+      getLangOpts().CUDAIsDevice &&
+      (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
+       D->getType()->isCUDADeviceBuiltinTextureType());
   // HIP pinned shadow of initialized host-side global variables are also
   // left undefined.
   bool IsHIPPinnedShadowVar =
       getLangOpts().CUDAIsDevice && D->hasAttr<HIPPinnedShadowAttr>();
-  if (getLangOpts().CUDA &&
-      (IsCUDASharedVar || IsCUDAShadowVar || IsHIPPinnedShadowVar))
+  if (getLangOpts().CUDA && (IsCUDASharedVar || IsCUDAShadowVar ||
+                             IsCUDADeviceShadowVar || IsHIPPinnedShadowVar))
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
   else if (D->hasAttr<LoaderUninitializedAttr>())
     Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy));
@@ -4061,25 +4080,52 @@
       if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
           D->hasAttr<HIPPinnedShadowAttr>()) {
         Linkage = llvm::GlobalValue::InternalLinkage;
-
-        // Shadow variables and their properties must be registered
-        // with CUDA runtime.
-        unsigned Flags = 0;
-        if (!D->hasDefinition())
-          Flags |= CGCUDARuntime::ExternDeviceVar;
-        if (D->hasAttr<CUDAConstantAttr>())
-          Flags |= CGCUDARuntime::ConstantDeviceVar;
-        // Extern global variables will be registered in the TU where they are
-        // defined.
+        // Shadow variables and their properties must be registered with CUDA
+        // runtime. Skip Extern global variables, which will be registered in
+        // the TU where they are defined.
         if (!D->hasExternalStorage())
-          getCUDARuntime().registerDeviceVar(D, *GV, Flags);
-      } else if (D->hasAttr<CUDASharedAttr>())
+          getCUDARuntime().registerDeviceVar(D, *GV, !D->hasDefinition(),
+                                             D->hasAttr<CUDAConstantAttr>());
+      } else if (D->hasAttr<CUDASharedAttr>()) {
         // __shared__ variables are odd. Shadows do get created, but
         // they are not registered with the CUDA runtime, so they
         // can't really be used to access their device-side
         // counterparts. It's not clear yet whether it's nvcc's bug or
         // a feature, but we've got to do the same for compatibility.
         Linkage = llvm::GlobalValue::InternalLinkage;
+      } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
+                 D->getType()->isCUDADeviceBuiltinTextureType()) {
+        const RecordDecl *RD = D->getType()->getAs<RecordType>()->getDecl();
+        // Builtin surfaces and textures and their template arguments are
+        // also registered with CUDA runtime.
+        if (const ClassTemplateSpecializationDecl *TD =
+                dyn_cast<ClassTemplateSpecializationDecl>(RD)) {
+          Linkage = llvm::GlobalValue::InternalLinkage;
+          const TemplateArgumentList &Args = TD->getTemplateInstantiationArgs();
+          if (RD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
+            assert(Args.size() == 2 &&
+                   "Unexpcted number of template arguments of CUDA device "
+                   "builtin surface type.");
+            auto Type = Args[1].getAsIntegral();
+            if (!D->hasExternalStorage())
+              getCUDARuntime().registerDeviceSurf(D, *GV, !D->hasDefinition(),
+                                                  Type.getSExtValue());
+          } else {
+            assert(Args.size() == 3 &&
+                   "Unexpected number of template arguments of CUDA device "
+                   "builtin texture type.");
+            auto Type = Args[1].getAsIntegral();
+            auto Normalized = Args[2].getAsIntegral();
+            assert(Normalized >= 0 && Normalized <= 1 &&
+                   "Unexpected normalized argument of CUDA device builtin "
+                   "texture type.");
+            if (!D->hasExternalStorage())
+              getCUDARuntime().registerDeviceTex(D, *GV, !D->hasDefinition(),
+                                                 Type.getSExtValue(),
+                                                 Normalized.getZExtValue());
+          }
+        }
+      }
     }
   }
 
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -383,6 +383,20 @@
 
   const Type *Ty = T.getTypePtr();
 
+  // For the device-side compilation, CUDA device builtin surface/texture types
+  // may be represented in different types.
+  if (Context.getLangOpts().CUDAIsDevice) {
+    if (T->isCUDADeviceBuiltinSurfaceType()) {
+      if (auto *Ty = CGM.getTargetCodeGenInfo()
+                         .getCUDADeviceBuiltinSurfaceDeviceType())
+        return Ty;
+    } else if (T->isCUDADeviceBuiltinTextureType()) {
+      if (auto *Ty = CGM.getTargetCodeGenInfo()
+                         .getCUDADeviceBuiltinTextureDeviceType())
+        return Ty;
+    }
+  }
+
   // RecordTypes are cached and processed specially.
   if (const RecordType *RT = dyn_cast<RecordType>(Ty))
     return ConvertRecordDeclType(RT->getDecl());
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -315,6 +315,32 @@
   virtual bool shouldEmitStaticExternCAliases() const { return true; }
 
   virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {}
+
+  /// Return the device-side type for the CUDA device builtin surface type.
+  virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const {
+    // By default, no change from the original one.
+    return nullptr;
+  }
+  /// Return the device-side type for the CUDA device builtin texture type.
+  virtual llvm::Type *getCUDADeviceBuiltinTextureDeviceType() const {
+    // By default, no change from the original one.
+    return nullptr;
+  }
+
+  /// Emit the device-side copy of the builtin surface type.
+  virtual bool emitCUDADeviceBuiltinSurfaceDeviceCopy(CodeGenFunction &CGF,
+                                                      LValue Dst,
+                                                      LValue Src) const {
+    // DO NOTHING by default.
+    return false;
+  }
+  /// Emit the device-side copy of the builtin texture type.
+  virtual bool emitCUDADeviceBuiltinTextureDeviceCopy(CodeGenFunction &CGF,
+                                                      LValue Dst,
+                                                      LValue Src) const {
+    // DO NOTHING by default.
+    return false;
+  }
 };
 
 } // namespace CodeGen
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -28,6 +28,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm> // std::sort
@@ -6435,10 +6436,51 @@
                            CodeGen::CodeGenModule &M) const override;
   bool shouldEmitStaticExternCAliases() const override;
 
+  llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const override {
+    return llvm::Type::getInt64Ty(getABIInfo().getVMContext());
+  }
+
+  llvm::Type *getCUDADeviceBuiltinTextureDeviceType() const override {
+    return llvm::Type::getInt64Ty(getABIInfo().getVMContext());
+  }
+
+  bool emitCUDADeviceBuiltinSurfaceDeviceCopy(CodeGenFunction &CGF, LValue Dst,
+                                              LValue Src) const override {
+    emitBuiltinSurfTexDeviceCopy(CGF, Dst, Src);
+    return true;
+  }
+
+  bool emitCUDADeviceBuiltinTextureDeviceCopy(CodeGenFunction &CGF, LValue Dst,
+                                              LValue Src) const override {
+    emitBuiltinSurfTexDeviceCopy(CGF, Dst, Src);
+    return true;
+  }
+
 private:
-  // Adds a NamedMDNode with F, Name, and Operand as operands, and adds the
+  // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
   // resulting MDNode to the nvvm.annotations MDNode.
-  static void addNVVMMetadata(llvm::Function *F, StringRef Name, int Operand);
+  static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
+                              int Operand);
+
+  static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
+                                           LValue Src) {
+    llvm::Value *Handle = nullptr;
+    llvm::Constant *C =
+        llvm::dyn_cast<llvm::Constant>(Src.getAddress(CGF).getPointer());
+    // Lookup `addrspacecast` through the constant pointer if any.
+    if (auto *ASC = llvm::dyn_cast_or_null<llvm::AddrSpaceCastOperator>(C))
+      C = llvm::cast<llvm::Constant>(ASC->getPointerOperand());
+    if (auto *GV = llvm::dyn_cast_or_null<llvm::GlobalVariable>(C)) {
+      // Load the handle from the specific global variable using
+      // `nvvm.texsurf.handle.internal` intrinsic.
+      Handle = CGF.EmitRuntimeCall(
+          CGF.CGM.getIntrinsic(llvm::Intrinsic::nvvm_texsurf_handle_internal,
+                               {GV->getType()}),
+          {GV}, "texsurf_handle");
+    } else
+      Handle = CGF.EmitLoadOfScalar(Src, SourceLocation());
+    CGF.EmitStoreOfScalar(Handle, Dst);
+  }
 };
 
 /// Checks if the type is unsupported directly by the current target.
@@ -6511,8 +6553,17 @@
     Ty = EnumTy->getDecl()->getIntegerType();
 
   // Return aggregates type as indirect by value
-  if (isAggregateTypeForABI(Ty))
+  if (isAggregateTypeForABI(Ty)) {
+    // Under CUDA device compilation, tex/surf builtin types are replaced with
+    // object types and passed directly.
+    if (getContext().getLangOpts().CUDAIsDevice) {
+      if (Ty->isCUDADeviceBuiltinSurfaceType())
+        return ABIArgInfo::getDirect(llvm::Type::getInt64Ty(getVMContext()));
+      if (Ty->isCUDADeviceBuiltinTextureType())
+        return ABIArgInfo::getDirect(llvm::Type::getInt64Ty(getVMContext()));
+    }
     return getNaturalAlignIndirect(Ty, /* byval */ true);
+  }
 
   return (Ty->isPromotableIntegerType() ? ABIArgInfo::getExtend(Ty)
                                         : ABIArgInfo::getDirect());
@@ -6540,6 +6591,17 @@
     const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
   if (GV->isDeclaration())
     return;
+  const VarDecl *VD = dyn_cast_or_null<VarDecl>(D);
+  if (VD) {
+    if (M.getLangOpts().CUDA) {
+      if (VD->getType()->isCUDADeviceBuiltinSurfaceType())
+        addNVVMMetadata(GV, "surface", 1);
+      else if (VD->getType()->isCUDADeviceBuiltinTextureType())
+        addNVVMMetadata(GV, "texture", 1);
+      return;
+    }
+  }
+
   const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
   if (!FD) return;
 
@@ -6588,16 +6650,16 @@
   }
 }
 
-void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::Function *F, StringRef Name,
-                                             int Operand) {
-  llvm::Module *M = F->getParent();
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
+                                             StringRef Name, int Operand) {
+  llvm::Module *M = GV->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
   // Get "nvvm.annotations" metadata node
   llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
 
   llvm::Metadata *MDVals[] = {
-      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, Name),
+      llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
       llvm::ConstantAsMetadata::get(
           llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
   // Append metadata to nvvm.annotations
diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
--- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -83,11 +83,13 @@
 #if CUDA_VERSION < 9000
 #define __CUDABE__
 #else
+#define __CUDACC__
 #define __CUDA_LIBDEVICE__
 #endif
 // Disables definitions of device-side runtime support stubs in
 // cuda_device_runtime_api.h
 #include "driver_types.h"
+#undef __CUDACC__
 #include "host_config.h"
 #include "host_defines.h"
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -6939,6 +6939,16 @@
     handleSimpleAttributeWithExclusions<HIPPinnedShadowAttr, CUDADeviceAttr,
                                         CUDAConstantAttr>(S, D, AL);
     break;
+  case ParsedAttr::AT_CUDADeviceBuiltinSurfaceType:
+    handleSimpleAttributeWithExclusions<CUDADeviceBuiltinSurfaceTypeAttr,
+                                        CUDADeviceBuiltinTextureTypeAttr>(S, D,
+                                                                          AL);
+    break;
+  case ParsedAttr::AT_CUDADeviceBuiltinTextureType:
+    handleSimpleAttributeWithExclusions<CUDADeviceBuiltinTextureTypeAttr,
+                                        CUDADeviceBuiltinSurfaceTypeAttr>(S, D,
+                                                                          AL);
+    break;
   case ParsedAttr::AT_GNUInline:
     handleGNUInlineAttr(S, D, AL);
     break;
diff --git a/clang/test/CodeGenCUDA/surface.cu b/clang/test/CodeGenCUDA/surface.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/surface.cu
@@ -0,0 +1,37 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -o - %s | FileCheck --check-prefix=DEVICE %s
+// RUN: echo "GPU binary would be here" > %t
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -target-sdk-version=8.0 -fcuda-include-gpubinary %t -emit-llvm -o - %s | FileCheck --check-prefix=HOST %s
+
+struct surfaceReference {
+  int desc;
+};
+
+template<typename T, int type = 1>
+struct __attribute__((device_builtin_surface_type)) surface : public surfaceReference {
+};
+
+// On the device side, surface references are represented as `i64` handles.
+// DEVICE: @surf = addrspace(1) global i64 undef, align 4
+// On the host side, they remain in the original type.
+// HOST: @surf = internal global %struct.surface
+// HOST: @0 = private unnamed_addr constant [5 x i8] c"surf\00"
+surface<void, 2> surf;
+
+__attribute__((device)) int suld_2d_zero(surface<void, 2>, int, int) asm("llvm.nvvm.suld.2d.i32.zero");
+
+// DEVICE-LABEL: i32 @_Z3fooii(i32 %x, i32 %y)
+// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @surf)
+// DEVICE: call i32 @llvm.nvvm.suld.2d.i32.zero(i64 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+__attribute__((device)) int foo(int x, int y) {
+  return suld_2d_zero(surf, x, y);
+}
+
+// HOST: define internal void @[[PREFIX:__cuda]]_register_globals
+// Texture references need registering with correct arguments.
+// HOST: call void @[[PREFIX]]RegisterSurface(i8** %0, i8*{{.*}}({{.*}}@surf{{.*}}), i8*{{.*}}({{.*}}@0{{.*}}), i8*{{.*}}({{.*}}@0{{.*}}), i32 2, i32 0)
+
+// They also need annotating in metadata.
+// DEVICE: !0 = !{i64 addrspace(1)* @surf, !"surface", i32 1}
diff --git a/clang/test/CodeGenCUDA/texture.cu b/clang/test/CodeGenCUDA/texture.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/texture.cu
@@ -0,0 +1,55 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -o - %s | FileCheck --check-prefix=DEVICE %s
+// RUN: echo "GPU binary would be here" > %t
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -target-sdk-version=8.0 -fcuda-include-gpubinary %t -emit-llvm -o - %s | FileCheck --check-prefix=HOST %s
+
+struct textureReference {
+  int desc;
+};
+
+enum ReadMode {
+  ElementType = 0,
+  NormalizedFloat = 1
+};
+
+template<typename T, int dim = 1, enum ReadMode mode = ElementType>
+struct __attribute__((device_builtin_texture_type)) texture : public textureReference {
+};
+
+// On the device side, texture references are represented as `i64` handles.
+// DEVICE: @tex = addrspace(1) global i64 undef, align 4
+// DEVICE: @norm = addrspace(1) global i64 undef, align 4
+// On the host side, they remain in the original type.
+// HOST: @tex = internal global %struct.texture
+// HOST: @norm = internal global %struct.texture
+// HOST: @0 = private unnamed_addr constant [4 x i8] c"tex\00"
+// HOST: @1 = private unnamed_addr constant [5 x i8] c"norm\00"
+texture<float, 2, ElementType> tex;
+texture<float, 2, NormalizedFloat> norm;
+
+struct v4f {
+  float x, y, z, w;
+};
+
+__attribute__((device)) v4f tex2d_ld(texture<float, 2, ElementType>, float, float) asm("llvm.nvvm.tex.unified.2d.v4f32.f32");
+__attribute__((device)) v4f tex2d_ld(texture<float, 2, NormalizedFloat>, int, int) asm("llvm.nvvm.tex.unified.2d.v4f32.s32");
+
+// DEVICE-LABEL: float @_Z3fooff(float %x, float %y)
+// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @tex)
+// DEVICE: call %struct.v4f @llvm.nvvm.tex.unified.2d.v4f32.f32(i64 %{{.*}}, float %{{.*}}, float %{{.*}})
+// DEVICE: call i64 @llvm.nvvm.texsurf.handle.internal.p1i64(i64 addrspace(1)* @norm)
+// DEVICE: call %struct.v4f @llvm.nvvm.tex.unified.2d.v4f32.s32(i64 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+__attribute__((device)) float foo(float x, float y) {
+  return tex2d_ld(tex, x, y).x + tex2d_ld(norm, int(x), int(y)).x;
+}
+
+// HOST: define internal void @[[PREFIX:__cuda]]_register_globals
+// Texture references need registering with correct arguments.
+// HOST: call void @[[PREFIX]]RegisterTexture(i8** %0, i8*{{.*}}({{.*}}@tex{{.*}}), i8*{{.*}}({{.*}}@0{{.*}}), i8*{{.*}}({{.*}}@0{{.*}}), i32 2, i32 0, i32 0)
+// HOST: call void @[[PREFIX]]RegisterTexture(i8** %0, i8*{{.*}}({{.*}}@norm{{.*}}), i8*{{.*}}({{.*}}@1{{.*}}), i8*{{.*}}({{.*}}@1{{.*}}), i32 2, i32 1, i32 0)
+
+// They also need annotating in metadata.
+// DEVICE: !0 = !{i64 addrspace(1)* @tex, !"texture", i32 1}
+// DEVICE: !1 = !{i64 addrspace(1)* @norm, !"texture", i32 1}
diff --git a/clang/test/SemaCUDA/attr-declspec.cu b/clang/test/SemaCUDA/attr-declspec.cu
--- a/clang/test/SemaCUDA/attr-declspec.cu
+++ b/clang/test/SemaCUDA/attr-declspec.cu
@@ -6,16 +6,21 @@
 // RUN: %clang_cc1 -DEXPECT_WARNINGS -fms-extensions -fsyntax-only -verify -x c %s
 
 #if defined(EXPECT_WARNINGS)
-// expected-warning@+12 {{'__device__' attribute ignored}}
-// expected-warning@+12 {{'__global__' attribute ignored}}
-// expected-warning@+12 {{'__constant__' attribute ignored}}
-// expected-warning@+12 {{'__shared__' attribute ignored}}
-// expected-warning@+12 {{'__host__' attribute ignored}}
+// expected-warning@+17 {{'__device__' attribute ignored}}
+// expected-warning@+17 {{'__global__' attribute ignored}}
+// expected-warning@+17 {{'__constant__' attribute ignored}}
+// expected-warning@+17 {{'__shared__' attribute ignored}}
+// expected-warning@+17 {{'__host__' attribute ignored}}
+// expected-warning@+22 {{'__device_builtin_surface_type__' attribute ignored}}
+// expected-warning@+22 {{'__device_builtin_texture_type__' attribute ignored}}
+// expected-warning@+22 {{'__device_builtin_surface_type__' attribute ignored}}
+// expected-warning@+22 {{'__device_builtin_texture_type__' attribute ignored}}
 //
 // (Currently we don't for the other attributes. They are implemented with
 // IgnoredAttr, which is ignored irrespective of any LangOpts.)
 #else
-// expected-no-diagnostics
+// expected-warning@+14 {{'__device_builtin_surface_type__' attribute only applies to types}}
+// expected-warning@+14 {{'__device_builtin_texture_type__' attribute only applies to types}}
 #endif
 
 __declspec(__device__) void f_device();
@@ -30,5 +35,7 @@
 __declspec(__cudart_builtin__) void f_cudart_builtin();
 __declspec(__device_builtin_surface_type__) unsigned long long surface_var;
 __declspec(__device_builtin_texture_type__) unsigned long long texture_var;
+struct __declspec(__device_builtin_surface_type__) surf_ref {};
+struct __declspec(__device_builtin_texture_type__) tex_ref {};
 
 // Note that there's no __declspec spelling of nv_weak.
diff --git a/clang/test/SemaCUDA/attributes-on-non-cuda.cu b/clang/test/SemaCUDA/attributes-on-non-cuda.cu
--- a/clang/test/SemaCUDA/attributes-on-non-cuda.cu
+++ b/clang/test/SemaCUDA/attributes-on-non-cuda.cu
@@ -7,16 +7,21 @@
 // RUN: %clang_cc1 -DEXPECT_WARNINGS -fsyntax-only -verify -x c %s
 
 #if defined(EXPECT_WARNINGS)
-// expected-warning@+12 {{'device' attribute ignored}}
-// expected-warning@+12 {{'global' attribute ignored}}
-// expected-warning@+12 {{'constant' attribute ignored}}
-// expected-warning@+12 {{'shared' attribute ignored}}
-// expected-warning@+12 {{'host' attribute ignored}}
+// expected-warning@+17 {{'device' attribute ignored}}
+// expected-warning@+17 {{'global' attribute ignored}}
+// expected-warning@+17 {{'constant' attribute ignored}}
+// expected-warning@+17 {{'shared' attribute ignored}}
+// expected-warning@+17 {{'host' attribute ignored}}
+// expected-warning@+23 {{'device_builtin_surface_type' attribute ignored}}
+// expected-warning@+23 {{'device_builtin_texture_type' attribute ignored}}
+// expected-warning@+23 {{'device_builtin_surface_type' attribute ignored}}
+// expected-warning@+23 {{'device_builtin_texture_type' attribute ignored}}
 //
 // NOTE: IgnoredAttr in clang which is used for the rest of
 // attributes ignores LangOpts, so there are no warnings.
 #else
-// expected-no-diagnostics
+// expected-warning@+15 {{'device_builtin_surface_type' attribute only applies to types}}
+// expected-warning@+15 {{'device_builtin_texture_type' attribute only applies to types}}
 #endif
 
 __attribute__((device)) void f_device();
@@ -32,3 +37,5 @@
 __attribute__((nv_weak)) void f_nv_weak();
 __attribute__((device_builtin_surface_type)) unsigned long long surface_var;
 __attribute__((device_builtin_texture_type)) unsigned long long texture_var;
+struct __attribute__((device_builtin_surface_type)) surf_ref {};
+struct __attribute__((device_builtin_texture_type)) tex_ref {};
diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h
--- a/llvm/include/llvm/IR/Operator.h
+++ b/llvm/include/llvm/IR/Operator.h
@@ -599,6 +599,25 @@
   }
 };
 
+class AddrSpaceCastOperator
+    : public ConcreteOperator<Operator, Instruction::AddrSpaceCast> {
+  friend class AddrSpaceCastInst;
+  friend class ConstantExpr;
+
+public:
+  Value *getPointerOperand() { return getOperand(0); }
+
+  const Value *getPointerOperand() const { return getOperand(0); }
+
+  unsigned getSrcAddressSpace() const {
+    return getPointerOperand()->getType()->getPointerAddressSpace();
+  }
+
+  unsigned getDestAddressSpace() const {
+    return getType()->getPointerAddressSpace();
+  }
+};
+
 } // end namespace llvm
 
 #endif // LLVM_IR_OPERATOR_H