Index: include/clang/AST/ASTContext.h
===================================================================
--- include/clang/AST/ASTContext.h
+++ include/clang/AST/ASTContext.h
@@ -2301,12 +2301,12 @@
                                              QualType typeDomain) const;
 
   unsigned getTargetAddressSpace(QualType T) const {
+    if (T->isFunctionType() && !T.getQualifiers().hasAddressSpace())
+      return 0;
     return getTargetAddressSpace(T.getQualifiers());
   }
 
-  unsigned getTargetAddressSpace(Qualifiers Q) const {
-    return getTargetAddressSpace(Q.getAddressSpace());
-  }
+  unsigned getTargetAddressSpace(Qualifiers Q) const;
 
   unsigned getTargetAddressSpace(unsigned AS) const {
     if (AS < LangAS::Offset || AS >= LangAS::Offset + LangAS::Count)
@@ -2319,6 +2319,8 @@
   /// constant folding.
   uint64_t getTargetNullPointerValue(QualType QT) const;
 
+  unsigned getTargetConstantAddressSpace() const;
+
   bool addressSpaceMapManglingFor(unsigned AS) const {
     return AddrSpaceMapMangling || 
            AS < LangAS::Offset || 
Index: include/clang/Basic/TargetInfo.h
===================================================================
--- include/clang/Basic/TargetInfo.h
+++ include/clang/Basic/TargetInfo.h
@@ -307,6 +307,10 @@
     return 0;
   }
 
+  virtual unsigned getConstantAddressSpace() const {
+    return 0;
+  }
+
   /// \brief Return the size of '_Bool' and C++ 'bool' for this target, in bits.
   unsigned getBoolWidth() const { return BoolWidth; }
 
@@ -953,6 +957,10 @@
     return *AddrSpaceMap;
   }
 
+  virtual unsigned getDefaultTargetAddressSpace(const LangOptions &Opt) const {
+    return 0;
+  }
+
   /// \brief Retrieve the name of the platform as it is used in the
   /// availability attribute.
   StringRef getPlatformName() const { return PlatformName; }
Index: lib/AST/ASTContext.cpp
===================================================================
--- lib/AST/ASTContext.cpp
+++ lib/AST/ASTContext.cpp
@@ -9538,6 +9538,16 @@
   return getTargetInfo().getNullPointerValue(AS);
 }
 
+unsigned ASTContext::getTargetConstantAddressSpace() const {
+  return getTargetInfo().getConstantAddressSpace();
+}
+
+unsigned ASTContext::getTargetAddressSpace(Qualifiers Q) const {
+  return Q.hasAddressSpace()
+             ? getTargetAddressSpace(Q.getAddressSpace())
+             : getTargetInfo().getDefaultTargetAddressSpace(LangOpts);
+}
+
 // Explicitly instantiate this in case a Redeclarable<T> is used from a TU that
 // doesn't include ASTContext.h
 template
Index: lib/Basic/Targets.cpp
===================================================================
--- lib/Basic/Targets.cpp
+++ lib/Basic/Targets.cpp
@@ -1997,16 +1997,6 @@
   return llvm::makeArrayRef(GCCRegNames);
 }
 
-static const unsigned AMDGPUAddrSpaceMap[] = {
-  1,    // opencl_global
-  3,    // opencl_local
-  2,    // opencl_constant
-  4,    // opencl_generic
-  1,    // cuda_device
-  2,    // cuda_constant
-  3     // cuda_shared
-};
-
 // If you edit the description strings, make sure you update
 // getPointerWidthV().
 
@@ -2020,9 +2010,18 @@
   "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 
 class AMDGPUTargetInfo final : public TargetInfo {
+  static const unsigned AddrSpaceMap_[7];
   static const Builtin::Info BuiltinInfo[];
   static const char * const GCCRegNames[];
 
+  enum AddrSpaceKind {
+    AS_Private = 0,
+    AS_Global = 1,
+    AS_Constant = 2,
+    AS_Local = 3,
+    AS_Generic = 4
+  };
+
   /// \brief The GPU profiles supported by the AMDGPU target.
   enum GPUKind {
     GK_NONE,
@@ -2066,7 +2065,7 @@
     resetDataLayout(getTriple().getArch() == llvm::Triple::amdgcn ?
                     DataLayoutStringSI : DataLayoutStringR600);
 
-    AddrSpaceMap = &AMDGPUAddrSpaceMap;
+    AddrSpaceMap = &AddrSpaceMap_;
     UseAddrSpaceMapMangling = true;
   }
 
@@ -2254,6 +2253,15 @@
     }
   }
 
+  unsigned
+  getDefaultTargetAddressSpace(const LangOptions &Opts) const override {
+    // OpenCL sets address space explicitly in AST. The default case (type
+    // qualifier containing no address space) represents private address space.
+    if (Opts.OpenCL)
+      return AS_Private;
+    return AS_Generic;
+  }
+
   LangAS::ID getOpenCLImageAddrSpace() const override {
     return LangAS::opencl_constant;
   }
@@ -2276,6 +2284,16 @@
   }
 };
 
+const unsigned AMDGPUTargetInfo::AddrSpaceMap_[] = {
+    AS_Global,   // opencl_global
+    AS_Local,    // opencl_local
+    AS_Constant, // opencl_constant
+    AS_Generic,  // opencl_generic
+    AS_Global,   // cuda_device
+    AS_Constant, // cuda_constant
+    AS_Local     // cuda_shared
+};
+
 const Builtin::Info AMDGPUTargetInfo::BuiltinInfo[] = {
 #define BUILTIN(ID, TYPE, ATTRS)                \
   { #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
Index: lib/CodeGen/CGBuiltin.cpp
===================================================================
--- lib/CodeGen/CGBuiltin.cpp
+++ lib/CodeGen/CGBuiltin.cpp
@@ -2312,7 +2312,7 @@
   case Builtin::BI__GetExceptionInfo: {
     if (llvm::GlobalVariable *GV =
             CGM.getCXXABI().getThrowInfo(FD->getParamDecl(0)->getType()))
-      return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
+      return RValue::get(llvm::ConstantExpr::getPointerCast(GV, CGM.Int8PtrTy));
     break;
   }
 
Index: lib/CodeGen/CGCall.cpp
===================================================================
--- lib/CodeGen/CGCall.cpp
+++ lib/CodeGen/CGCall.cpp
@@ -3643,18 +3643,19 @@
   if (llvm::StructType *ArgStruct = CallInfo.getArgStruct()) {
     ArgMemoryLayout = CGM.getDataLayout().getStructLayout(ArgStruct);
     llvm::Instruction *IP = CallArgs.getStackBase();
-    llvm::AllocaInst *AI;
+    llvm::Instruction *CastedAI;
     if (IP) {
       IP = IP->getNextNode();
-      AI = new llvm::AllocaInst(ArgStruct, "argmem", IP);
+      CastedAI = CreateAlloca(ArgStruct, "argmem", IP);
     } else {
-      AI = CreateTempAlloca(ArgStruct, "argmem");
+      CastedAI = CreateTempAlloca(ArgStruct, "argmem");
     }
     auto Align = CallInfo.getArgStructAlignment();
+    auto *AI = getAddrSpaceCastedAlloca(CastedAI);
     AI->setAlignment(Align.getQuantity());
     AI->setUsedWithInAlloca(true);
     assert(AI->isUsedWithInAlloca() && !AI->isStaticAlloca());
-    ArgMemory = Address(AI, Align);
+    ArgMemory = Address(CastedAI, Align);
   }
 
   // Helper function to drill into the inalloca allocation.
Index: lib/CodeGen/CGClass.cpp
===================================================================
--- lib/CodeGen/CGClass.cpp
+++ lib/CodeGen/CGClass.cpp
@@ -2372,12 +2372,16 @@
 
   // Finally, store the address point. Use the same LLVM types as the field to
   // support optimization.
+  auto DefAddr = CGM.getTarget().getDefaultTargetAddressSpace(
+      CGM.getLangOpts());
   llvm::Type *VTablePtrTy =
       llvm::FunctionType::get(CGM.Int32Ty, /*isVarArg=*/true)
-          ->getPointerTo()
-          ->getPointerTo();
-  VTableField = Builder.CreateBitCast(VTableField, VTablePtrTy->getPointerTo());
-  VTableAddressPoint = Builder.CreateBitCast(VTableAddressPoint, VTablePtrTy);
+          ->getPointerTo(DefAddr)
+          ->getPointerTo(DefAddr);
+  VTableField = Builder.CreatePointerBitCastOrAddrSpaceCast(VTableField,
+      VTablePtrTy->getPointerTo(DefAddr));
+  VTableAddressPoint = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      VTableAddressPoint, VTablePtrTy);
 
   llvm::StoreInst *Store = Builder.CreateStore(VTableAddressPoint, VTableField);
   CGM.DecorateInstructionWithTBAA(Store, CGM.getTBAAInfoForVTablePtr());
Index: lib/CodeGen/CGDecl.cpp
===================================================================
--- lib/CodeGen/CGDecl.cpp
+++ lib/CodeGen/CGDecl.cpp
@@ -1075,7 +1075,15 @@
     llvm::AllocaInst *vla = Builder.CreateAlloca(llvmTy, elementCount, "vla");
     vla->setAlignment(alignment.getQuantity());
 
-    address = Address(vla, alignment);
+    llvm::Value *V = vla;
+    auto DefaultAddr = getTarget().getDefaultTargetAddressSpace(getLangOpts());
+    if (DefaultAddr != 0) {
+      auto *DestTy =
+          llvm::PointerType::get(vla->getType()->getElementType(), DefaultAddr);
+      V = Builder.CreateAddrSpaceCast(vla, DestTy);
+    }
+
+    address = Address(V, alignment);
   }
 
   setAddrOfLocalVar(&D, address);
@@ -1244,7 +1252,7 @@
     // Otherwise, create a temporary global with the initializer then
     // memcpy from the global to the alloca.
     std::string Name = getStaticDeclName(CGM, D);
-    unsigned AS = 0;
+    unsigned AS = CGM.getContext().getTargetConstantAddressSpace();
     if (getLangOpts().OpenCL) {
       AS = CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant);
       BP = llvm::PointerType::getInt8PtrTy(getLLVMContext(), AS);
Index: lib/CodeGen/CGDeclCXX.cpp
===================================================================
--- lib/CodeGen/CGDeclCXX.cpp
+++ lib/CodeGen/CGDeclCXX.cpp
@@ -135,7 +135,7 @@
   CharUnits WidthChars = CGF.getContext().getTypeSizeInChars(D.getType());
   uint64_t Width = WidthChars.getQuantity();
   llvm::Value *Args[2] = { llvm::ConstantInt::getSigned(CGF.Int64Ty, Width),
-                           llvm::ConstantExpr::getBitCast(Addr, CGF.Int8PtrTy)};
+                           llvm::ConstantExpr::getPointerCast(Addr, CGF.Int8PtrTy)};
   CGF.Builder.CreateCall(InvariantStart, Args);
 }
 
Index: lib/CodeGen/CGException.cpp
===================================================================
--- lib/CodeGen/CGException.cpp
+++ lib/CodeGen/CGException.cpp
@@ -237,7 +237,7 @@
 static llvm::Constant *getOpaquePersonalityFn(CodeGenModule &CGM,
                                         const EHPersonality &Personality) {
   llvm::Constant *Fn = getPersonalityFn(CGM, Personality);
-  return llvm::ConstantExpr::getBitCast(Fn, CGM.Int8PtrTy);
+  return llvm::ConstantExpr::getPointerCast(Fn, CGM.Int8PtrTy);
 }
 
 /// Check whether a landingpad instruction only uses C++ features.
@@ -1520,7 +1520,7 @@
     llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration(
         &CGM.getModule(), llvm::Intrinsic::localrecover);
     llvm::Constant *ParentI8Fn =
-        llvm::ConstantExpr::getBitCast(ParentCGF.CurFn, Int8PtrTy);
+        llvm::ConstantExpr::getPointerCast(ParentCGF.CurFn, Int8PtrTy);
     RecoverCall = Builder.CreateCall(
         FrameRecoverFn, {ParentI8Fn, ParentFP,
                          llvm::ConstantInt::get(Int32Ty, FrameEscapeIdx)});
@@ -1585,7 +1585,7 @@
     llvm::Function *RecoverFPIntrin =
         CGM.getIntrinsic(llvm::Intrinsic::x86_seh_recoverfp);
     llvm::Constant *ParentI8Fn =
-        llvm::ConstantExpr::getBitCast(ParentCGF.CurFn, Int8PtrTy);
+        llvm::ConstantExpr::getPointerCast(ParentCGF.CurFn, Int8PtrTy);
     ParentFP = Builder.CreateCall(RecoverFPIntrin, {ParentI8Fn, EntryFP});
   }
 
@@ -1812,7 +1812,7 @@
   llvm::Function *FilterFunc =
       HelperCGF.GenerateSEHFilterFunction(*this, *Except);
   llvm::Constant *OpaqueFunc =
-      llvm::ConstantExpr::getBitCast(FilterFunc, Int8PtrTy);
+      llvm::ConstantExpr::getPointerCast(FilterFunc, Int8PtrTy);
   CatchScope->setHandler(0, OpaqueFunc, createBasicBlock("__except.ret"));
 }
 
Index: lib/CodeGen/CGExpr.cpp
===================================================================
--- lib/CodeGen/CGExpr.cpp
+++ lib/CodeGen/CGExpr.cpp
@@ -62,16 +62,37 @@
 /// block.
 Address CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align,
                                           const Twine &Name) {
-  auto Alloca = CreateTempAlloca(Ty, Name);
+  auto CastedAlloca = CreateTempAlloca(Ty, Name);
+  auto *Alloca = getAddrSpaceCastedAlloca(CastedAlloca);
   Alloca->setAlignment(Align.getQuantity());
-  return Address(Alloca, Align);
+  return Address(CastedAlloca, Align);
 }
 
 /// CreateTempAlloca - This creates a alloca and inserts it into the entry
 /// block.
-llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
-                                                    const Twine &Name) {
-  return new llvm::AllocaInst(Ty, nullptr, Name, AllocaInsertPt);
+llvm::Instruction *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty,
+                                                     const Twine &Name) {
+  return CreateAlloca(Ty, Name, AllocaInsertPt);
+}
+
+llvm::Instruction *CodeGenFunction::CreateAlloca(llvm::Type *Ty,
+                                                 const Twine &Name,
+                                                 llvm::Instruction *InsertPos) {
+  llvm::Instruction *V = new llvm::AllocaInst(Ty, nullptr, Name, InsertPos);
+  auto DefaultAddr = getTarget().getDefaultTargetAddressSpace(getLangOpts());
+  if (DefaultAddr != 0) {
+    auto *DestTy = llvm::PointerType::get(V->getType()->getPointerElementType(),
+                                          DefaultAddr);
+    V = new llvm::AddrSpaceCastInst(V, DestTy, "", InsertPos);
+  }
+  return V;
+}
+
+llvm::AllocaInst *
+CodeGenFunction::getAddrSpaceCastedAlloca(llvm::Instruction *V) const {
+  if (auto *Cast = dyn_cast<llvm::AddrSpaceCastInst>(V))
+    return cast<llvm::AllocaInst>(Cast->getOperand(0));
+  return cast<llvm::AllocaInst>(V);
 }
 
 /// CreateDefaultAlignTempAlloca - This creates an alloca with the
Index: lib/CodeGen/CGExprConstant.cpp
===================================================================
--- lib/CodeGen/CGExprConstant.cpp
+++ lib/CodeGen/CGExprConstant.cpp
@@ -1316,7 +1316,7 @@
       if (!Offset->isNullValue()) {
         unsigned AS = C->getType()->getPointerAddressSpace();
         llvm::Type *CharPtrTy = Int8Ty->getPointerTo(AS);
-        llvm::Constant *Casted = llvm::ConstantExpr::getBitCast(C, CharPtrTy);
+        llvm::Constant *Casted = llvm::ConstantExpr::getPointerCast(C, CharPtrTy);
         Casted = llvm::ConstantExpr::getGetElementPtr(Int8Ty, Casted, Offset);
         C = llvm::ConstantExpr::getPointerCast(Casted, C->getType());
       }
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -924,7 +924,7 @@
       DefaultOpenMPPSource =
           CGM.GetAddrOfConstantCString(";unknown;unknown;0;0;;").getPointer();
       DefaultOpenMPPSource =
-          llvm::ConstantExpr::getBitCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
+          llvm::ConstantExpr::getPointerCast(DefaultOpenMPPSource, CGM.Int8PtrTy);
     }
 
     ConstantInitBuilder builder(CGM);
@@ -2918,7 +2918,7 @@
   llvm::Module &M = CGM.getModule();
 
   // Make sure the address has the right type.
-  llvm::Constant *AddrPtr = llvm::ConstantExpr::getBitCast(ID, CGM.VoidPtrTy);
+  llvm::Constant *AddrPtr = llvm::ConstantExpr::getPointerCast(ID, CGM.VoidPtrTy);
 
   // Create constant string with the name.
   llvm::Constant *StrPtrInit = llvm::ConstantDataArray::getString(C, Name);
@@ -2928,7 +2928,7 @@
                                llvm::GlobalValue::InternalLinkage, StrPtrInit,
                                ".omp_offloading.entry_name");
   Str->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-  llvm::Constant *StrPtr = llvm::ConstantExpr::getBitCast(Str, CGM.Int8PtrTy);
+  llvm::Constant *StrPtr = llvm::ConstantExpr::getPointerCast(Str, CGM.Int8PtrTy);
 
   // We can't have any padding between symbols, so we need to have 1-byte
   // alignment.
@@ -4871,7 +4871,7 @@
   // the device, because these functions will be entry points to the device.
 
   if (CGM.getLangOpts().OpenMPIsDevice) {
-    OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy);
+    OutlinedFnID = llvm::ConstantExpr::getPointerCast(OutlinedFn, CGM.Int8PtrTy);
     OutlinedFn->setLinkage(llvm::GlobalValue::ExternalLinkage);
   } else
     OutlinedFnID = new llvm::GlobalVariable(
Index: lib/CodeGen/CGVTT.cpp
===================================================================
--- lib/CodeGen/CGVTT.cpp
+++ lib/CodeGen/CGVTT.cpp
@@ -84,7 +84,7 @@
          VTable->getValueType(), VTable, Idxs, /*InBounds=*/true,
          /*InRangeIndex=*/1);
 
-     Init = llvm::ConstantExpr::getBitCast(Init, Int8PtrTy);
+     Init = llvm::ConstantExpr::getPointerCast(Init, Int8PtrTy);
 
      VTTComponents.push_back(Init);
   }
Index: lib/CodeGen/CGVTables.cpp
===================================================================
--- lib/CodeGen/CGVTables.cpp
+++ lib/CodeGen/CGVTables.cpp
@@ -550,7 +550,7 @@
     return addOffsetConstant(component.getOffsetToTop());
 
   case VTableComponent::CK_RTTI:
-    return builder.add(llvm::ConstantExpr::getBitCast(rtti, CGM.Int8PtrTy));
+    return builder.add(llvm::ConstantExpr::getPointerCast(rtti, CGM.Int8PtrTy));
 
   case VTableComponent::CK_FunctionPointer:
   case VTableComponent::CK_CompleteDtorPointer:
@@ -594,7 +594,7 @@
       llvm::Constant *fn = CGM.CreateRuntimeFunction(fnTy, name);
       if (auto f = dyn_cast<llvm::Function>(fn))
         f->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-      return llvm::ConstantExpr::getBitCast(fn, CGM.Int8PtrTy);
+      return llvm::ConstantExpr::getPointerCast(fn, CGM.Int8PtrTy);
     };
 
     llvm::Constant *fnPtr;
@@ -628,7 +628,7 @@
       fnPtr = CGM.GetAddrOfFunction(GD, fnTy, /*ForVTable=*/true);
     }
 
-    fnPtr = llvm::ConstantExpr::getBitCast(fnPtr, CGM.Int8PtrTy);
+    fnPtr = llvm::ConstantExpr::getPointerCast(fnPtr, CGM.Int8PtrTy);
     builder.add(fnPtr);
     return;
   }
Index: lib/CodeGen/CodeGenFunction.h
===================================================================
--- lib/CodeGen/CodeGenFunction.h
+++ lib/CodeGen/CodeGenFunction.h
@@ -377,7 +377,7 @@
   };
 
   /// i32s containing the indexes of the cleanup destinations.
-  llvm::AllocaInst *NormalCleanupDest;
+  llvm::Instruction *NormalCleanupDest;
 
   unsigned NextCleanupDestIndex;
 
@@ -392,8 +392,8 @@
   llvm::Value *ExceptionSlot;
 
   /// The selector slot.  Under the MandatoryCleanup model, all landing pads
-  /// write the current selector value into this alloca.
-  llvm::AllocaInst *EHSelectorSlot;
+  /// write the current selector value into this instruction.
+  llvm::Instruction *EHSelectorSlot;
 
   /// A stack of exception code slots. Entering an __except block pushes a slot
   /// on the stack and leaving pops one. The __exception_code() intrinsic loads
@@ -428,11 +428,11 @@
 
     /// An i1 variable indicating whether or not the @finally is
     /// running for an exception.
-    llvm::AllocaInst *ForEHVar;
+    llvm::Instruction *ForEHVar;
 
     /// An i8* variable into which the exception pointer to rethrow
     /// has been saved.
-    llvm::AllocaInst *SavedExnVar;
+    llvm::Instruction *SavedExnVar;
 
   public:
     void enter(CodeGenFunction &CGF, const Stmt *Finally,
@@ -1858,14 +1858,23 @@
                             AlignmentSource *Source = nullptr);
   LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy);
 
+  /// Create an alloca instruction. If the default address space is not 0,
+  /// insert addrspacecast instruction which casts the alloca instruction
+  /// to the default address space.
+  llvm::Instruction *CreateAlloca(llvm::Type *Ty, const Twine &Name = "tmp",
+                                  llvm::Instruction *InsertPos = nullptr);
   /// CreateTempAlloca - This creates a alloca and inserts it into the entry
   /// block. The caller is responsible for setting an appropriate alignment on
-  /// the alloca.
-  llvm::AllocaInst *CreateTempAlloca(llvm::Type *Ty,
-                                     const Twine &Name = "tmp");
+  /// the alloca. If the default address space is not 0, insert addrspacecast.
+  llvm::Instruction *CreateTempAlloca(llvm::Type *Ty,
+                                      const Twine &Name = "tmp");
   Address CreateTempAlloca(llvm::Type *Ty, CharUnits align,
                            const Twine &Name = "tmp");
 
+  /// Get alloca instruction operand of an addrspacecast instruction.
+  /// If \p Inst is alloca instruction, returns \p Inst;
+  llvm::AllocaInst *getAddrSpaceCastedAlloca(llvm::Instruction *Inst) const;
+
   /// CreateDefaultAlignedTempAlloca - This creates an alloca with the
   /// default ABI alignment of the given LLVM type.
   ///
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -442,7 +442,7 @@
     "callsite");
 
   llvm::Value *args[] = {
-    llvm::ConstantExpr::getBitCast(CurFn, PointerTy),
+    llvm::ConstantExpr::getPointerCast(CurFn, PointerTy),
     CallSite
   };
 
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -47,6 +47,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
@@ -90,6 +91,7 @@
       VMContext(M.getContext()), Types(*this), VTables(*this),
       SanitizerMD(new SanitizerMetadata(*this)) {
 
+  unsigned DefaultTargetAddressSpace = Target.getDefaultTargetAddressSpace(LangOpts);
   // Initialize the type cache.
   llvm::LLVMContext &LLVMContext = M.getContext();
   VoidTy = llvm::Type::getVoidTy(LLVMContext);
@@ -99,9 +101,9 @@
   Int64Ty = llvm::Type::getInt64Ty(LLVMContext);
   FloatTy = llvm::Type::getFloatTy(LLVMContext);
   DoubleTy = llvm::Type::getDoubleTy(LLVMContext);
-  PointerWidthInBits = C.getTargetInfo().getPointerWidth(0);
+  PointerWidthInBits = C.getTargetInfo().getPointerWidth(DefaultTargetAddressSpace);
   PointerAlignInBytes =
-    C.toCharUnitsFromBits(C.getTargetInfo().getPointerAlign(0)).getQuantity();
+    C.toCharUnitsFromBits(C.getTargetInfo().getPointerAlign(DefaultTargetAddressSpace)).getQuantity();
   SizeSizeInBytes =
     C.toCharUnitsFromBits(C.getTargetInfo().getMaxPointerWidth()).getQuantity();
   IntAlignInBytes =
@@ -109,8 +111,8 @@
   IntTy = llvm::IntegerType::get(LLVMContext, C.getTargetInfo().getIntWidth());
   IntPtrTy = llvm::IntegerType::get(LLVMContext,
     C.getTargetInfo().getMaxPointerWidth());
-  Int8PtrTy = Int8Ty->getPointerTo(0);
-  Int8PtrPtrTy = Int8PtrTy->getPointerTo(0);
+  Int8PtrTy = Int8Ty->getPointerTo(DefaultTargetAddressSpace);
+  Int8PtrPtrTy = Int8PtrTy->getPointerTo(DefaultTargetAddressSpace);
 
   RuntimeCC = getTargetCodeGenInfo().getABIInfo().getRuntimeCC();
   BuiltinCC = getTargetCodeGenInfo().getABIInfo().getBuiltinCC();
@@ -750,7 +752,7 @@
     ctor.addInt(Int32Ty, I.Priority);
     ctor.add(llvm::ConstantExpr::getBitCast(I.Initializer, CtorPFTy));
     if (I.AssociatedData)
-      ctor.add(llvm::ConstantExpr::getBitCast(I.AssociatedData, VoidPtrTy));
+      ctor.add(llvm::ConstantExpr::getPointerCast(I.AssociatedData, VoidPtrTy));
     else
       ctor.addNullPointer(VoidPtrTy);
     ctor.finishAndAddTo(ctors);
@@ -1418,10 +1420,13 @@
                  *LineNoCst = EmitAnnotationLineNo(L);
 
   // Create the ConstantStruct for the global annotation.
+  unsigned AS = GV->getType()->getAddressSpace();
+  llvm::PointerType *I8PTy = (AS == Int8PtrTy->getAddressSpace()) ?
+    Int8PtrTy : Int8Ty->getPointerTo(AS);
   llvm::Constant *Fields[4] = {
-    llvm::ConstantExpr::getBitCast(GV, Int8PtrTy),
-    llvm::ConstantExpr::getBitCast(AnnoGV, Int8PtrTy),
-    llvm::ConstantExpr::getBitCast(UnitGV, Int8PtrTy),
+    llvm::ConstantExpr::getPointerCast(GV, I8PTy),
+    llvm::ConstantExpr::getPointerCast(AnnoGV, I8PTy),
+    llvm::ConstantExpr::getPointerCast(UnitGV, I8PTy),
     LineNoCst
   };
   return llvm::ConstantStruct::getAnon(Fields);
@@ -1548,7 +1553,7 @@
   llvm::GlobalValue *Entry = GetGlobalValue(AA->getAliasee());
   if (Entry) {
     unsigned AS = getContext().getTargetAddressSpace(VD->getType());
-    auto Ptr = llvm::ConstantExpr::getBitCast(Entry, DeclTy->getPointerTo(AS));
+    auto Ptr = llvm::ConstantExpr::getPointerCast(Entry, DeclTy->getPointerTo(AS));
     return ConstantAddress(Ptr, Alignment);
   }
 
@@ -1900,7 +1905,7 @@
 /// GetOrCreateLLVMFunction - If the specified mangled name is not in the
 /// module, create and return an llvm Function with the specified type. If there
 /// is something in the module with the specified name, return it potentially
-/// bitcasted to the right type.
+/// casted to the right type.
 ///
 /// If D is non-null, it specifies a decl that correspond to this.  This is used
 /// to set the attributes on the function when it is first created.
@@ -1952,7 +1957,7 @@
     // (If function is requested for a definition, we always need to create a new
     // function, not just return a bitcast.)
     if (!IsForDefinition)
-      return llvm::ConstantExpr::getBitCast(Entry, Ty->getPointerTo());
+      return llvm::ConstantExpr::getPointerCast(Entry, Ty->getPointerTo());
   }
 
   // This function doesn't have a complete type (for example, the return
@@ -2060,7 +2065,7 @@
   }
 
   llvm::Type *PTy = llvm::PointerType::getUnqual(Ty);
-  return llvm::ConstantExpr::getBitCast(F, PTy);
+  return llvm::ConstantExpr::getPointerCast(F, PTy);
 }
 
 /// GetAddrOfFunction - Return the address of the given function.  If Ty is
@@ -2189,7 +2194,7 @@
 /// GetOrCreateLLVMGlobal - If the specified mangled name is not in the module,
 /// create and return an llvm GlobalVariable with the specified type.  If there
 /// is something in the module with the specified name, return it potentially
-/// bitcasted to the right type.
+/// casted to the right type.
 ///
 /// If D is non-null, it specifies a decl that correspond to this.  This is used
 /// to set the attributes on the global when it is first created.
@@ -2237,14 +2242,10 @@
       }
     }
 
-    // Make sure the result is of the correct type.
-    if (Entry->getType()->getAddressSpace() != Ty->getAddressSpace())
-      return llvm::ConstantExpr::getAddrSpaceCast(Entry, Ty);
-
     // (If global is requested for a definition, we always need to create a new
     // global, not just return a bitcast.)
     if (!IsForDefinition)
-      return llvm::ConstantExpr::getBitCast(Entry, Ty);
+      return llvm::ConstantExpr::getPointerCast(Entry, Ty);
   }
 
   unsigned AddrSpace = GetGlobalVarAddressSpace(D, Ty->getAddressSpace());
@@ -2260,7 +2261,7 @@
 
     if (!Entry->use_empty()) {
       llvm::Constant *NewPtrForOldDecl =
-          llvm::ConstantExpr::getBitCast(GV, Entry->getType());
+          llvm::ConstantExpr::getPointerCast(GV, Entry->getType());
       Entry->replaceAllUsesWith(NewPtrForOldDecl);
     }
 
@@ -2372,7 +2373,7 @@
     
     if (!OldGV->use_empty()) {
       llvm::Constant *NewPtrForOldDecl =
-      llvm::ConstantExpr::getBitCast(GV, OldGV->getType());
+      llvm::ConstantExpr::getPointerCast(GV, OldGV->getType());
       OldGV->replaceAllUsesWith(NewPtrForOldDecl);
     }
     
@@ -2621,7 +2622,7 @@
 
     // Replace all uses of the old global with the new global
     llvm::Constant *NewPtrForOldDecl =
-        llvm::ConstantExpr::getBitCast(GV, Entry->getType());
+        llvm::ConstantExpr::getPointerCast(GV, Entry->getType());
     Entry->replaceAllUsesWith(NewPtrForOldDecl);
 
     // Erase the old global, since it is no longer used.
@@ -3116,7 +3117,7 @@
     // Remove it and replace uses of it with the alias.
     GA->takeName(Entry);
 
-    Entry->replaceAllUsesWith(llvm::ConstantExpr::getBitCast(GA,
+    Entry->replaceAllUsesWith(llvm::ConstantExpr::getPointerCast(GA,
                                                           Entry->getType()));
     Entry->eraseFromParent();
   } else {
@@ -3334,7 +3335,7 @@
 
   if (isUTF16)
     // Cast the UTF16 string to the correct type.
-    Str = llvm::ConstantExpr::getBitCast(Str, Int8PtrTy);
+    Str = llvm::ConstantExpr::getPointerCast(Str, Int8PtrTy);
   Fields.add(Str);
 
   // String length.
Index: lib/CodeGen/ItaniumCXXABI.cpp
===================================================================
--- lib/CodeGen/ItaniumCXXABI.cpp
+++ lib/CodeGen/ItaniumCXXABI.cpp
@@ -1108,7 +1108,7 @@
     if (!Record->hasTrivialDestructor()) {
       CXXDestructorDecl *DtorD = Record->getDestructor();
       Dtor = CGM.getAddrOfCXXStructor(DtorD, StructorType::Complete);
-      Dtor = llvm::ConstantExpr::getBitCast(Dtor, CGM.Int8PtrTy);
+      Dtor = llvm::ConstantExpr::getPointerCast(Dtor, CGM.Int8PtrTy);
     }
   }
   if (!Dtor) Dtor = llvm::Constant::getNullValue(CGM.Int8PtrTy);
@@ -2171,7 +2171,7 @@
 
   llvm::Value *args[] = {
     llvm::ConstantExpr::getBitCast(dtor, dtorTy),
-    llvm::ConstantExpr::getBitCast(addr, CGF.Int8PtrTy),
+    llvm::ConstantExpr::getPointerCast(addr, CGF.Int8PtrTy),
     handle
   };
   CGF.EmitNounwindRuntimeCall(atexit, args);
@@ -2584,7 +2584,7 @@
     }
   }
 
-  return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
+  return llvm::ConstantExpr::getPointerCast(GV, CGM.Int8PtrTy);
 }
 
 /// TypeInfoIsInStandardLibrary - Given a builtin type, returns whether the type
@@ -2913,7 +2913,7 @@
   llvm::Constant *Two = llvm::ConstantInt::get(PtrDiffTy, 2);
   VTable =
       llvm::ConstantExpr::getInBoundsGetElementPtr(CGM.Int8PtrTy, VTable, Two);
-  VTable = llvm::ConstantExpr::getBitCast(VTable, CGM.Int8PtrTy);
+  VTable = llvm::ConstantExpr::getPointerCast(VTable, CGM.Int8PtrTy);
 
   Fields.push_back(VTable);
 }
@@ -2986,7 +2986,7 @@
     assert(!OldGV->hasAvailableExternallyLinkage() &&
            "available_externally typeinfos not yet implemented");
 
-    return llvm::ConstantExpr::getBitCast(OldGV, CGM.Int8PtrTy);
+    return llvm::ConstantExpr::getPointerCast(OldGV, CGM.Int8PtrTy);
   }
 
   // Check if there is already an external RTTI descriptor for this type.
@@ -3022,7 +3022,7 @@
     TypeNameField =
         llvm::ConstantExpr::getIntToPtr(TypeNameField, CGM.Int8PtrTy);
   } else {
-    TypeNameField = llvm::ConstantExpr::getBitCast(TypeName, CGM.Int8PtrTy);
+    TypeNameField = llvm::ConstantExpr::getPointerCast(TypeName, CGM.Int8PtrTy);
   }
   Fields.push_back(TypeNameField);
 
@@ -3177,7 +3177,7 @@
     }
   }
 
-  return llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy);
+  return llvm::ConstantExpr::getPointerCast(GV, CGM.Int8PtrTy);
 }
 
 /// BuildObjCObjectTypeInfo - Build the appropriate kind of type_info
Index: test/CodeGenCUDA/address-spaces.cu
===================================================================
--- test/CodeGenCUDA/address-spaces.cu
+++ test/CodeGenCUDA/address-spaces.cu
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -fcuda-is-device -triple nvptx-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -fcuda-is-device -triple nvptx-unknown-unknown | FileCheck --check-prefixes=NVPTX,CHECK %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -fcuda-is-device -triple amdgcn | FileCheck --check-prefixes=AMDGCN,CHECK %s
 
 // Verifies Clang emits correct address spaces and addrspacecast instructions
 // for CUDA code.
@@ -8,7 +9,8 @@
 // CHECK: @i = addrspace(1) externally_initialized global
 __device__ int i;
 
-// CHECK: @j = addrspace(4) externally_initialized global
+// AMDGCN: @j = addrspace(2) externally_initialized global
+// NVPTX: @j = addrspace(4) externally_initialized global
 __constant__ int j;
 
 // CHECK: @k = addrspace(3) global
@@ -27,17 +29,21 @@
 // CHECK: @b = addrspace(3) global float undef
 
 __device__ void foo() {
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
+  // NVPTX: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
+  // AMDGCN: load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @i to i32 addrspace(4)*)
   i++;
 
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(4)* @j to i32*)
+  // NVPTX: load i32, i32* addrspacecast (i32 addrspace(4)* @j to i32*)
+  // AMDGCN: load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(2)* @j to i32 addrspace(4)*)
   j++;
 
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @k to i32*)
+  // NVPTX: load i32, i32* addrspacecast (i32 addrspace(3)* @k to i32*)
+  // AMDGCN: load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @k to i32 addrspace(4)*)
   k++;
 
   __shared__ int lk;
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
+  // NVPTX: load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
+  // AMDGCN: load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32 addrspace(4)*)
   lk++;
 }
 
@@ -47,8 +53,9 @@
   ap->data1 = 1;
   ap->data2 = 2;
 }
-// CHECK: define void @_Z5func0v()
-// CHECK: store %struct.MyStruct* addrspacecast (%struct.MyStruct addrspace(3)* @_ZZ5func0vE1a to %struct.MyStruct*), %struct.MyStruct** %ap
+// CHECK-LABEL: define void @_Z5func0v()
+// NVPTX: store %struct.MyStruct* addrspacecast (%struct.MyStruct addrspace(3)* @_ZZ5func0vE1a to %struct.MyStruct*), %struct.MyStruct** %ap
+// AMDGCN: store %struct.MyStruct addrspace(4)* addrspacecast (%struct.MyStruct addrspace(3)* @_ZZ5func0vE1a to %struct.MyStruct addrspace(4)*), %struct.MyStruct addrspace(4)* addrspace(4)* %ap
 
 __device__ void callee(float *ap) {
   *ap = 1.0f;
@@ -58,37 +65,42 @@
   __shared__ float a;
   callee(&a); // implicit cast from parameters
 }
-// CHECK: define void @_Z5func1v()
-// CHECK: call void @_Z6calleePf(float* addrspacecast (float addrspace(3)* @_ZZ5func1vE1a to float*))
+// CHECK-LABEL: define void @_Z5func1v()
+// NVPTX: call void @_Z6calleePf(float* addrspacecast (float addrspace(3)* @_ZZ5func1vE1a to float*))
+// AMDGCN: call void @_Z6calleePf(float addrspace(4)* addrspacecast (float addrspace(3)* @_ZZ5func1vE1a to float addrspace(4)*))
 
 __device__ void func2() {
   __shared__ float a[256];
   float *ap = &a[128]; // implicit cast from a decayed array
   *ap = 1.0f;
 }
-// CHECK: define void @_Z5func2v()
-// CHECK: store float* getelementptr inbounds ([256 x float], [256 x float]* addrspacecast ([256 x float] addrspace(3)* @_ZZ5func2vE1a to [256 x float]*), i32 0, i32 128), float** %ap
-
+// CHECK-LABEL: define void @_Z5func2v()
+// NVPTX: store float* getelementptr inbounds ([256 x float], [256 x float]* addrspacecast ([256 x float] addrspace(3)* @_ZZ5func2vE1a to [256 x float]*), i32 0, i32 128), float** %ap
+// AMDGCN: store float addrspace(4)* getelementptr inbounds ([256 x float], [256 x float] addrspace(4)* addrspacecast ([256 x float] addrspace(3)* @_ZZ5func2vE1a to [256 x float] addrspace(4)*), i64 0, i64 128), float addrspace(4)* addrspace(4)* %ap
 __device__ void func3() {
   __shared__ float a;
   float *ap = reinterpret_cast<float *>(&a); // explicit cast
   *ap = 1.0f;
 }
-// CHECK: define void @_Z5func3v()
-// CHECK: store float* addrspacecast (float addrspace(3)* @_ZZ5func3vE1a to float*), float** %ap
+// CHECK-LABEL: define void @_Z5func3v()
+// NVPTX: store float* addrspacecast (float addrspace(3)* @_ZZ5func3vE1a to float*), float** %ap
+// AMDGCN: store float addrspace(4)* addrspacecast (float addrspace(3)* @_ZZ5func3vE1a to float addrspace(4)*), float addrspace(4)* addrspace(4)* %ap
 
 __device__ void func4() {
   __shared__ float a;
   float *ap = (float *)&a; // explicit c-style cast
   *ap = 1.0f;
 }
-// CHECK: define void @_Z5func4v()
-// CHECK: store float* addrspacecast (float addrspace(3)* @_ZZ5func4vE1a to float*), float** %ap
+// CHECK-LABEL: define void @_Z5func4v()
+// NVPTX: store float* addrspacecast (float addrspace(3)* @_ZZ5func4vE1a to float*), float** %ap
+// AMDGCN: store float addrspace(4)* addrspacecast (float addrspace(3)* @_ZZ5func4vE1a to float addrspace(4)*), float addrspace(4)* addrspace(4)* %ap
 
 __shared__ float b;
 
 __device__ float *func5() {
   return &b; // implicit cast from a return value
 }
-// CHECK: define float* @_Z5func5v()
-// CHECK: ret float* addrspacecast (float addrspace(3)* @b to float*)
+// NVPTX-LABEL: define float* @_Z5func5v()
+// AMDGCN-LABEL: define float addrspace(4)* @_Z5func5v()
+// NVPTX: ret float* addrspacecast (float addrspace(3)* @b to float*)
+// AMDGCN: ret float addrspace(4)* addrspacecast (float addrspace(3)* @b to float addrspace(4)*)
Index: test/CodeGenCUDA/convergent.cu
===================================================================
--- test/CodeGenCUDA/convergent.cu
+++ test/CodeGenCUDA/convergent.cu
@@ -2,6 +2,9 @@
 // REQUIRES: nvptx-registered-target
 
 // RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s -DNVPTX | FileCheck -check-prefixes=DEVICE,NVPTX %s
+
+// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn -emit-llvm \
 // RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix DEVICE %s
 
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
@@ -25,9 +28,11 @@
 __host__ __device__ void bar() {
   // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
   baz();
-  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
+  #ifdef NVPTX
+  // NVPTX: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
   int x;
   asm ("trap;" : "=l"(x));
+  #endif
   // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
   asm volatile ("trap;");
 }
Index: test/CodeGenCUDA/device-var-init.cu
===================================================================
--- test/CodeGenCUDA/device-var-init.cu
+++ test/CodeGenCUDA/device-var-init.cu
@@ -4,7 +4,10 @@
 // variables, but accept empty constructors allowed by CUDA.
 
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \
-// RUN:     -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck %s
+// RUN:     -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,NVPTX %s
+
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -std=c++11 \
+// RUN:     -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCN %s
 
 #ifdef __clang__
 #include "Inputs/cuda.h"
@@ -18,7 +21,8 @@
 __shared__ int s_v;
 // CHECK: @s_v = addrspace(3) global i32 undef,
 __constant__ int c_v;
-// CHECK: addrspace(4) externally_initialized global i32 0,
+// NVPTX: addrspace(4) externally_initialized global i32 0,
+// AMDGCN: addrspace(2) externally_initialized global i32 0,
 
 __device__ int d_v_i = 1;
 // CHECK: @d_v_i = addrspace(1) externally_initialized global i32 1,
@@ -29,12 +33,14 @@
 __shared__ T s_t;
 // CHECK: @s_t = addrspace(3) global %struct.T undef,
 __constant__ T c_t;
-// CHECK: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer,
+// NVPTX: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer,
+// AMDGCN: @c_t = addrspace(2) externally_initialized global %struct.T zeroinitializer,
 
 __device__ T d_t_i = {2};
 // CHECK: @d_t_i = addrspace(1) externally_initialized global %struct.T { i32 2 },
 __constant__ T c_t_i = {2};
-// CHECK: @c_t_i = addrspace(4) externally_initialized global %struct.T { i32 2 },
+// NVPTX: @c_t_i = addrspace(4) externally_initialized global %struct.T { i32 2 },
+// AMDGCN: @c_t_i = addrspace(2) externally_initialized global %struct.T { i32 2 },
 
 // empty constructor
 __device__ EC d_ec;
@@ -42,7 +48,8 @@
 __shared__ EC s_ec;
 // CHECK: @s_ec = addrspace(3) global %struct.EC undef,
 __constant__ EC c_ec;
-// CHECK: @c_ec = addrspace(4) externally_initialized global %struct.EC zeroinitializer,
+// NVPTX: @c_ec = addrspace(4) externally_initialized global %struct.EC zeroinitializer,
+// AMDGCN: @c_ec = addrspace(2) externally_initialized global %struct.EC zeroinitializer,
 
 // empty destructor
 __device__ ED d_ed;
@@ -50,14 +57,16 @@
 __shared__ ED s_ed;
 // CHECK: @s_ed = addrspace(3) global %struct.ED undef,
 __constant__ ED c_ed;
-// CHECK: @c_ed = addrspace(4) externally_initialized global %struct.ED zeroinitializer,
+// NVPTX: @c_ed = addrspace(4) externally_initialized global %struct.ED zeroinitializer,
+// AMDGCN: @c_ed = addrspace(2) externally_initialized global %struct.ED zeroinitializer,
 
 __device__ ECD d_ecd;
 // CHECK: @d_ecd = addrspace(1) externally_initialized global %struct.ECD zeroinitializer,
 __shared__ ECD s_ecd;
 // CHECK: @s_ecd = addrspace(3) global %struct.ECD undef,
 __constant__ ECD c_ecd;
-// CHECK: @c_ecd = addrspace(4) externally_initialized global %struct.ECD zeroinitializer,
+// NVPTX: @c_ecd = addrspace(4) externally_initialized global %struct.ECD zeroinitializer,
+// AMDGCN: @c_ecd = addrspace(2) externally_initialized global %struct.ECD zeroinitializer,
 
 // empty templated constructor -- allowed with no arguments
 __device__ ETC d_etc;
@@ -65,12 +74,14 @@
 __shared__ ETC s_etc;
 // CHECK: @s_etc = addrspace(3) global %struct.ETC undef,
 __constant__ ETC c_etc;
-// CHECK: @c_etc = addrspace(4) externally_initialized global %struct.ETC zeroinitializer,
+// NVPTX: @c_etc = addrspace(4) externally_initialized global %struct.ETC zeroinitializer,
+// AMDGCN: @c_etc = addrspace(2) externally_initialized global %struct.ETC zeroinitializer,
 
 __device__ NCFS d_ncfs;
 // CHECK: @d_ncfs = addrspace(1) externally_initialized global %struct.NCFS { i32 3 }
 __constant__ NCFS c_ncfs;
-// CHECK: @c_ncfs = addrspace(4) externally_initialized global %struct.NCFS { i32 3 }
+// NVPTX: @c_ncfs = addrspace(4) externally_initialized global %struct.NCFS { i32 3 }
+// AMDGCN: @c_ncfs = addrspace(2) externally_initialized global %struct.NCFS { i32 3 }
 
 // Regular base class -- allowed
 __device__ T_B_T d_t_b_t;
@@ -78,7 +89,8 @@
 __shared__ T_B_T s_t_b_t;
 // CHECK: @s_t_b_t = addrspace(3) global %struct.T_B_T undef,
 __constant__ T_B_T c_t_b_t;
-// CHECK: @c_t_b_t = addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer,
+// NVPTX: @c_t_b_t = addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer,
+// AMDGCN: @c_t_b_t = addrspace(2) externally_initialized global %struct.T_B_T zeroinitializer,
 
 // Incapsulated object of allowed class -- allowed
 __device__ T_F_T d_t_f_t;
@@ -86,7 +98,8 @@
 __shared__ T_F_T s_t_f_t;
 // CHECK: @s_t_f_t = addrspace(3) global %struct.T_F_T undef,
 __constant__ T_F_T c_t_f_t;
-// CHECK: @c_t_f_t = addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer,
+// NVPTX: @c_t_f_t = addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer,
+// AMDGCN: @c_t_f_t = addrspace(2) externally_initialized global %struct.T_F_T zeroinitializer,
 
 // array of allowed objects -- allowed
 __device__ T_FA_T d_t_fa_t;
@@ -94,7 +107,8 @@
 __shared__ T_FA_T s_t_fa_t;
 // CHECK: @s_t_fa_t = addrspace(3) global %struct.T_FA_T undef,
 __constant__ T_FA_T c_t_fa_t;
-// CHECK: @c_t_fa_t = addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer,
+// NVPTX: @c_t_fa_t = addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer,
+// AMDGCN: @c_t_fa_t = addrspace(2) externally_initialized global %struct.T_FA_T zeroinitializer,
 
 
 // Calling empty base class initializer is OK
@@ -103,7 +117,8 @@
 __shared__ EC_I_EC s_ec_i_ec;
 // CHECK: @s_ec_i_ec = addrspace(3) global %struct.EC_I_EC undef,
 __constant__ EC_I_EC c_ec_i_ec;
-// CHECK: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer,
+// NVPTX: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer,
+// AMDGCN: @c_ec_i_ec = addrspace(2) externally_initialized global %struct.EC_I_EC zeroinitializer,
 
 // We should not emit global initializers for device-side variables.
 // CHECK-NOT: @__cxx_global_var_init
@@ -114,41 +129,53 @@
   T t;
   // CHECK-NOT: call
   EC ec;
-  // CHECK:   call void @_ZN2ECC1Ev(%struct.EC* %ec)
+  // NVPTX:   call void @_ZN2ECC1Ev(%struct.EC* %ec)
+  // AMDGCN:   call void @_ZN2ECC1Ev(%struct.EC addrspace(4)* %ec)
   ED ed;
   // CHECK-NOT: call
   ECD ecd;
-  // CHECK:   call void @_ZN3ECDC1Ev(%struct.ECD* %ecd)
+  // NVPTX:   call void @_ZN3ECDC1Ev(%struct.ECD* %ecd)
+  // AMDGCN:   call void @_ZN3ECDC1Ev(%struct.ECD addrspace(4)* %ecd)
   ETC etc;
-  // CHECK:   call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc)
+  // NVPTX:   call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc)
+  // AMDGCN:   call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC addrspace(4)* %etc)
   UC uc;
   // undefined constructor -- not allowed
-  // CHECK:   call void @_ZN2UCC1Ev(%struct.UC* %uc)
+  // NVPTX:   call void @_ZN2UCC1Ev(%struct.UC* %uc)
+  // AMDGCN:   call void @_ZN2UCC1Ev(%struct.UC addrspace(4)* %uc)
   UD ud;
   // undefined destructor -- not allowed
   // CHECK-NOT: call
   ECI eci;
   // empty constructor w/ initializer list -- not allowed
-  // CHECK:   call void @_ZN3ECIC1Ev(%struct.ECI* %eci)
+  // NVPTX:   call void @_ZN3ECIC1Ev(%struct.ECI* %eci)
+  // AMDGCN:   call void @_ZN3ECIC1Ev(%struct.ECI addrspace(4)* %eci)
   NEC nec;
   // non-empty constructor -- not allowed
-  // CHECK:   call void @_ZN3NECC1Ev(%struct.NEC* %nec)
+  // NVPTX:   call void @_ZN3NECC1Ev(%struct.NEC* %nec)
+  // AMDGCN:   call void @_ZN3NECC1Ev(%struct.NEC addrspace(4)* %nec)
   // non-empty destructor -- not allowed
   NED ned;
   // no-constructor,  virtual method -- not allowed
-  // CHECK:   call void @_ZN3NCVC1Ev(%struct.NCV* %ncv)
+  // NVPTX:   call void @_ZN3NCVC1Ev(%struct.NCV* %ncv)
+  // AMDGCN:   call void @_ZN3NCVC1Ev(%struct.NCV addrspace(4)* %ncv)
   NCV ncv;
   // CHECK-NOT: call
   VD vd;
-  // CHECK:   call void @_ZN2VDC1Ev(%struct.VD* %vd)
+  // NVPTX:   call void @_ZN2VDC1Ev(%struct.VD* %vd)
+  // AMDGCN:   call void @_ZN2VDC1Ev(%struct.VD addrspace(4)* %vd)
   NCF ncf;
-  // CHECK:   call void @_ZN3NCFC1Ev(%struct.NCF* %ncf)
+  // NVPTX:   call void @_ZN3NCFC1Ev(%struct.NCF* %ncf)
+  // AMDGCN:   call void @_ZN3NCFC1Ev(%struct.NCF addrspace(4)* %ncf)
   NCFS ncfs;
-  // CHECK:   call void @_ZN4NCFSC1Ev(%struct.NCFS* %ncfs)
+  // NVPTX:   call void @_ZN4NCFSC1Ev(%struct.NCFS* %ncfs)
+  // AMDGCN:   call void @_ZN4NCFSC1Ev(%struct.NCFS addrspace(4)* %ncfs)
   UTC utc;
-  // CHECK:   call void @_ZN3UTCC1IJEEEDpT_(%struct.UTC* %utc)
+  // NVPTX:   call void @_ZN3UTCC1IJEEEDpT_(%struct.UTC* %utc)
+  // AMDGCN:   call void @_ZN3UTCC1IJEEEDpT_(%struct.UTC addrspace(4)* %utc)
   NETC netc;
-  // CHECK:   call void @_ZN4NETCC1IJEEEDpT_(%struct.NETC* %netc)
+  // NVPTX:   call void @_ZN4NETCC1IJEEEDpT_(%struct.NETC* %netc)
+  // AMDGCN:   call void @_ZN4NETCC1IJEEEDpT_(%struct.NETC addrspace(4)* %netc)
   T_B_T t_b_t;
   // CHECK-NOT: call
   T_F_T t_f_t;
@@ -156,17 +183,23 @@
   T_FA_T t_fa_t;
   // CHECK-NOT: call
   EC_I_EC ec_i_ec;
-  // CHECK:   call void @_ZN7EC_I_ECC1Ev(%struct.EC_I_EC* %ec_i_ec)
+  // NVPTX:   call void @_ZN7EC_I_ECC1Ev(%struct.EC_I_EC* %ec_i_ec)
+  // AMDGCN:   call void @_ZN7EC_I_ECC1Ev(%struct.EC_I_EC addrspace(4)* %ec_i_ec)
   EC_I_EC1 ec_i_ec1;
-  // CHECK:   call void @_ZN8EC_I_EC1C1Ev(%struct.EC_I_EC1* %ec_i_ec1)
+  // NVPTX:   call void @_ZN8EC_I_EC1C1Ev(%struct.EC_I_EC1* %ec_i_ec1)
+  // AMDGCN:   call void @_ZN8EC_I_EC1C1Ev(%struct.EC_I_EC1 addrspace(4)* %ec_i_ec1)
   T_V_T t_v_t;
-  // CHECK:   call void @_ZN5T_V_TC1Ev(%struct.T_V_T* %t_v_t)
+  // NVPTX:   call void @_ZN5T_V_TC1Ev(%struct.T_V_T* %t_v_t)
+  // AMDGCN:   call void @_ZN5T_V_TC1Ev(%struct.T_V_T addrspace(4)* %t_v_t)
   T_B_NEC t_b_nec;
-  // CHECK:   call void @_ZN7T_B_NECC1Ev(%struct.T_B_NEC* %t_b_nec)
+  // NVPTX:   call void @_ZN7T_B_NECC1Ev(%struct.T_B_NEC* %t_b_nec)
+  // AMDGCN:   call void @_ZN7T_B_NECC1Ev(%struct.T_B_NEC addrspace(4)* %t_b_nec)
   T_F_NEC t_f_nec;
-  // CHECK:   call void @_ZN7T_F_NECC1Ev(%struct.T_F_NEC* %t_f_nec)
+  // NVPTX:   call void @_ZN7T_F_NECC1Ev(%struct.T_F_NEC* %t_f_nec)
+  // AMDGCN:   call void @_ZN7T_F_NECC1Ev(%struct.T_F_NEC addrspace(4)* %t_f_nec)
   T_FA_NEC t_fa_nec;
-  // CHECK:   call void @_ZN8T_FA_NECC1Ev(%struct.T_FA_NEC* %t_fa_nec)
+  // NVPTX:   call void @_ZN8T_FA_NECC1Ev(%struct.T_FA_NEC* %t_fa_nec)
+  // AMDGCN:   call void @_ZN8T_FA_NECC1Ev(%struct.T_FA_NEC addrspace(4)* %t_fa_nec)
   T_B_NED t_b_ned;
   // CHECK-NOT: call
   T_F_NED t_f_ned;
@@ -174,22 +207,33 @@
   T_FA_NED t_fa_ned;
   // CHECK-NOT: call
   static __shared__ EC s_ec;
-  // CHECK-NOT: call void @_ZN2ECC1Ev(%struct.EC* addrspacecast (%struct.EC addrspace(3)* @_ZZ2dfvE4s_ec to %struct.EC*))
+  // NVPTX-NOT: call void @_ZN2ECC1Ev(%struct.EC* addrspacecast (%struct.EC addrspace(3)* @_ZZ2dfvE4s_ec to %struct.EC*))
+  // AMDGCN-NOT: call void @_ZN2ECC1Ev(%struct.EC addrspace(4)* addrspacecast (%struct.EC addrspace(3)* @_ZZ2dfvE4s_ec to %struct.EC addrspace(4)*))
   static __shared__ ETC s_etc;
-  // CHECK-NOT: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* addrspacecast (%struct.ETC addrspace(3)* @_ZZ2dfvE5s_etc to %struct.ETC*))
+  // NVPTX-NOT: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* addrspacecast (%struct.ETC addrspace(3)* @_ZZ2dfvE5s_etc to %struct.ETC*))
+  // AMDGCN-NOT: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC addrspace(4)* addrspacecast (%struct.ETC addrspace(3)* @_ZZ2dfvE5s_etc to %struct.ETC addrspace(4)*))
 
   // anchor point separating constructors and destructors
   df(); // CHECK: call void @_Z2dfv()
 
   // Verify that we only call non-empty destructors
-  // CHECK-NEXT: call void @_ZN8T_FA_NEDD1Ev(%struct.T_FA_NED* %t_fa_ned)
-  // CHECK-NEXT: call void @_ZN7T_F_NEDD1Ev(%struct.T_F_NED* %t_f_ned)
-  // CHECK-NEXT: call void @_ZN7T_B_NEDD1Ev(%struct.T_B_NED* %t_b_ned)
-  // CHECK-NEXT: call void @_ZN2VDD1Ev(%struct.VD* %vd)
-  // CHECK-NEXT: call void @_ZN3NEDD1Ev(%struct.NED* %ned)
-  // CHECK-NEXT: call void @_ZN2UDD1Ev(%struct.UD* %ud)
-  // CHECK-NEXT: call void @_ZN3ECDD1Ev(%struct.ECD* %ecd)
-  // CHECK-NEXT: call void @_ZN2EDD1Ev(%struct.ED* %ed)
+  // NVPTX-NEXT: call void @_ZN8T_FA_NEDD1Ev(%struct.T_FA_NED* %t_fa_ned)
+  // NVPTX-NEXT: call void @_ZN7T_F_NEDD1Ev(%struct.T_F_NED* %t_f_ned)
+  // NVPTX-NEXT: call void @_ZN7T_B_NEDD1Ev(%struct.T_B_NED* %t_b_ned)
+  // NVPTX-NEXT: call void @_ZN2VDD1Ev(%struct.VD* %vd)
+  // NVPTX-NEXT: call void @_ZN3NEDD1Ev(%struct.NED* %ned)
+  // NVPTX-NEXT: call void @_ZN2UDD1Ev(%struct.UD* %ud)
+  // NVPTX-NEXT: call void @_ZN3ECDD1Ev(%struct.ECD* %ecd)
+  // NVPTX-NEXT: call void @_ZN2EDD1Ev(%struct.ED* %ed)
+
+  // AMDGCN-NEXT: call void @_ZN8T_FA_NEDD1Ev(%struct.T_FA_NED addrspace(4)* %t_fa_ned)
+  // AMDGCN-NEXT: call void @_ZN7T_F_NEDD1Ev(%struct.T_F_NED addrspace(4)* %t_f_ned)
+  // AMDGCN-NEXT: call void @_ZN7T_B_NEDD1Ev(%struct.T_B_NED addrspace(4)* %t_b_ned)
+  // AMDGCN-NEXT: call void @_ZN2VDD1Ev(%struct.VD addrspace(4)* %vd)
+  // AMDGCN-NEXT: call void @_ZN3NEDD1Ev(%struct.NED addrspace(4)* %ned)
+  // AMDGCN-NEXT: call void @_ZN2UDD1Ev(%struct.UD addrspace(4)* %ud)
+  // AMDGCN-NEXT: call void @_ZN3ECDD1Ev(%struct.ECD addrspace(4)* %ecd)
+  // AMDGCN-NEXT: call void @_ZN2EDD1Ev(%struct.ED addrspace(4)* %ed)
 
   // CHECK-NEXT: ret void
 }
Index: test/CodeGenCUDA/device-vtable.cu
===================================================================
--- test/CodeGenCUDA/device-vtable.cu
+++ test/CodeGenCUDA/device-vtable.cu
@@ -10,6 +10,8 @@
 // RUN:     | FileCheck %s -check-prefix=CHECK-HOST -check-prefix=CHECK-BOTH
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck %s -check-prefix=CHECK-DEVICE -check-prefix=CHECK-BOTH
 
 #include "Inputs/cuda.h"
 
Index: test/CodeGenCUDA/filter-decl.cu
===================================================================
--- test/CodeGenCUDA/filter-decl.cu
+++ test/CodeGenCUDA/filter-decl.cu
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=CHECK-HOST %s
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefix=CHECK-DEVICE %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefixes=CHECK-DEVICE,ITANIUM %s
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm %s -o - -fcuda-is-device | FileCheck -check-prefixes=CHECK-DEVICE,AMDGCN %s
 
 #include "Inputs/cuda.h"
 
@@ -10,15 +11,18 @@
 __asm__("file scope asm is host only");
 
 // CHECK-HOST: constantdata = internal global
-// CHECK-DEVICE: constantdata = externally_initialized global
+// ITANIUM: constantdata = externally_initialized global
+// AMDGCN: constantdata = addrspace(2) externally_initialized global
 __constant__ char constantdata[256];
 
 // CHECK-HOST: devicedata = internal global
-// CHECK-DEVICE: devicedata = externally_initialized global
+// ITANIUM: devicedata = externally_initialized global
+// AMDGCN: devicedata = addrspace(1) externally_initialized global
 __device__ char devicedata[256];
 
 // CHECK-HOST: shareddata = internal global
-// CHECK-DEVICE: shareddata = global
+// ITANIUM: shareddata = global
+// AMDGCN: shareddata = addrspace(3) global
 __shared__ char shareddata[256];
 
 // CHECK-HOST: hostdata = global
Index: test/CodeGenCUDA/function-overload.cu
===================================================================
--- test/CodeGenCUDA/function-overload.cu
+++ test/CodeGenCUDA/function-overload.cu
@@ -8,6 +8,8 @@
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
 // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
+// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -emit-llvm -o - %s \
+// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
 
 #include "Inputs/cuda.h"
 
Index: test/CodeGenCUDA/kernel-args-alignment.cu
===================================================================
--- test/CodeGenCUDA/kernel-args-alignment.cu
+++ test/CodeGenCUDA/kernel-args-alignment.cu
@@ -1,8 +1,11 @@
 // RUN: %clang_cc1 --std=c++11 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | \
-// RUN:  FileCheck -check-prefix HOST -check-prefix CHECK %s
+// RUN:  FileCheck -check-prefixes=HOST,CHECK %s
 
 // RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple nvptx64-nvidia-cuda \
-// RUN:   -emit-llvm -o - %s | FileCheck -check-prefix DEVICE -check-prefix CHECK %s
+// RUN:   -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,CHECK,NVPTX %s
+
+// RUN: %clang_cc1 --std=c++11 -fcuda-is-device -triple amdgcn-amd-amdhsa \
+// RUN:   -emit-llvm -o - %s -DAMDGCN| FileCheck -check-prefixes=DEVICE,CHECK,AMDGCN %s
 
 #include "Inputs/cuda.h"
 
@@ -18,9 +21,17 @@
 
 // Clang should generate a packed LLVM struct for S (denoted by the <>s),
 // otherwise this test isn't interesting.
-// CHECK: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
-
+// HOST: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
+// NVPTX: %struct.S = type <{ i32*, i8, %struct.U, [5 x i8] }>
+// ToDo: Fix padding on amdgcn target to be the same as host
+// AMDGCN: %struct.S = type <{ i32 addrspace(4)*, i8, %struct.U, i8 }>
+
+// ToDo: Fix struct padding on amdgcn so that alignof(S) == 8
+#ifdef AMDGCN
+static_assert(alignof(S) == 4, "Unexpected alignment.");
+#else
 static_assert(alignof(S) == 8, "Unexpected alignment.");
+#endif
 
 // HOST-LABEL: @_Z6kernelc1SPi
 // Marshalled kernel args should be:
@@ -32,5 +43,7 @@
 // HOST: call i32 @cudaSetupArgument({{[^,]*}}, i64 8, i64 24)
 
 // DEVICE-LABEL: @_Z6kernelc1SPi
-// DEVICE-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
+// NVPTX-SAME: i8{{[^,]*}}, %struct.S* byval align 8{{[^,]*}}, i32*
+// ToDo: Fix amdgcn so that align of struct.S is 8
+// AMDGCN-SAME: i8{{[^,]*}}, %struct.S* byval align 4{{[^,]*}}, i32 addrspace(4)*
 __global__ void kernel(char a, S s, int *b) {}
Index: test/OpenMP/nvptx_parallel_codegen.cpp
===================================================================
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -2,6 +2,7 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=amdgcn -emit-llvm-bc %s -o %t-x86-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics