diff --git a/.gitignore b/.gitignore
new file mode 100644
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build.*/
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1484,14 +1484,14 @@
 
 def AMDGPUFlatWorkGroupSize : InheritableAttr {
   let Spellings = [Clang<"amdgpu_flat_work_group_size", 0>];
-  let Args = [UnsignedArgument<"Min">, UnsignedArgument<"Max">];
+  let Args = [ExprArgument<"Min">, ExprArgument<"Max">];
   let Documentation = [AMDGPUFlatWorkGroupSizeDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
 
 def AMDGPUWavesPerEU : InheritableAttr {
   let Spellings = [Clang<"amdgpu_waves_per_eu", 0>];
-  let Args = [UnsignedArgument<"Min">, UnsignedArgument<"Max", 1>];
+  let Args = [ExprArgument<"Min">, ExprArgument<"Max", 1>];
   let Documentation = [AMDGPUWavesPerEUDocs];
   let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">;
 }
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -1048,6 +1048,10 @@
       EK_Decltype, EK_TemplateArgument, EK_Other
     } ExprContext;
 
+    /// If we are checking arguments of a template, this is the template
+    /// under check.
+    TemplateDecl *Template;
+
     ExpressionEvaluationContextRecord(ExpressionEvaluationContext Context,
                                       unsigned NumCleanupObjects,
                                       CleanupInfo ParentCleanup,
@@ -1056,7 +1060,7 @@
         : Context(Context), ParentCleanup(ParentCleanup),
           NumCleanupObjects(NumCleanupObjects), NumTypos(0),
           ManglingContextDecl(ManglingContextDecl), MangleNumbering(),
-          ExprContext(ExprContext) {}
+          ExprContext(ExprContext), Template(nullptr) {}
 
     /// Retrieve the mangling numbering context, used to consistently
     /// number constructs like lambdas for mangling.
@@ -6537,10 +6541,12 @@
 
   bool CheckTemplateArgument(TemplateTypeParmDecl *Param,
                              TypeSourceInfo *Arg);
-  ExprResult CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
-                                   QualType InstantiatedParamType, Expr *Arg,
-                                   TemplateArgument &Converted,
-                               CheckTemplateArgumentKind CTAK = CTAK_Specified);
+  ExprResult
+  CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
+                        QualType InstantiatedParamType, Expr *Arg,
+                        TemplateArgument &Converted,
+                        CheckTemplateArgumentKind CTAK = CTAK_Specified,
+                        TemplateDecl *Template = nullptr);
   bool CheckTemplateTemplateArgument(TemplateParameterList *Params,
                                      TemplateArgumentLoc &Arg);
 
@@ -8659,6 +8665,16 @@
   void AddXConsumedAttr(Decl *D, SourceRange SR, unsigned SpellingIndex,
                         RetainOwnershipKind K, bool IsTemplateInstantiation);
 
+  /// addAMDGPUFlatWorkGroupSizeAttr - Adds an amdgpu_flat_work_group_size
+  /// attribute to a particular declaration.
+  void addAMDGPUFlatWorkGroupSizeAttr(SourceRange AttrRange, Decl *D, Expr *Min,
+                                      Expr *Max, unsigned SpellingListIndex);
+
+  /// addAMDGPUWavePersEUAttr - Adds an amdgpu_waves_per_eu attribute to a
+  /// particular declaration.
+  void addAMDGPUWavesPerEUAttr(SourceRange AttrRange, Decl *D, Expr *Min,
+                               Expr *Max, unsigned SpellingListIndex);
+
   bool checkNSReturnsRetainedReturnType(SourceLocation loc, QualType type);
 
   //===--------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -7797,8 +7797,16 @@
 
   const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
   if (ReqdWGS || FlatWGS) {
-    unsigned Min = FlatWGS ? FlatWGS->getMin() : 0;
-    unsigned Max = FlatWGS ? FlatWGS->getMax() : 0;
+    unsigned Min = 0;
+    unsigned Max = 0;
+    if (FlatWGS) {
+      Min = FlatWGS->getMin()
+                ->EvaluateKnownConstInt(M.getContext())
+                .getExtValue();
+      Max = FlatWGS->getMax()
+                ->EvaluateKnownConstInt(M.getContext())
+                .getExtValue();
+    }
     if (ReqdWGS && Min == 0 && Max == 0)
       Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
 
@@ -7812,8 +7820,12 @@
   }
 
   if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>()) {
-    unsigned Min = Attr->getMin();
-    unsigned Max = Attr->getMax();
+    unsigned Min =
+        Attr->getMin()->EvaluateKnownConstInt(M.getContext()).getExtValue();
+    unsigned Max = Attr->getMax() ? Attr->getMax()
+                                        ->EvaluateKnownConstInt(M.getContext())
+                                        .getExtValue()
+                                  : 0;
 
     if (Min != 0) {
       assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -675,9 +675,22 @@
 bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) {
   assert(getLangOpts().CUDA && "Should only be called during CUDA compilation");
   assert(Callee && "Callee may not be null.");
+
+  auto &ExprEvalCtx = ExprEvalContexts.back();
+  if (ExprEvalCtx.isUnevaluated())
+    return true;
+
+  FunctionDecl *Caller = nullptr;
+  if (auto *Template = ExprEvalContexts.back().Template) {
+    if (auto *FD = dyn_cast<FunctionDecl>(Template->getTemplatedDecl()))
+      Caller = FD;
+  } else if (ExprEvalCtx.isConstantEvaluated())
+    return true;
+
   // FIXME: Is bailing out early correct here?  Should we instead assume that
   // the caller is a global initializer?
-  FunctionDecl *Caller = dyn_cast<FunctionDecl>(CurContext);
+  if (!Caller)
+    Caller = dyn_cast<FunctionDecl>(CurContext);
   if (!Caller)
     return true;
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -245,11 +245,11 @@
       !Expr->isIntegerConstantExpr(I, S.Context)) {
     if (Idx != UINT_MAX)
       S.Diag(getAttrLoc(AI), diag::err_attribute_argument_n_type)
-          << AI << Idx << AANT_ArgumentIntegerConstant
+          << &AI << Idx << AANT_ArgumentIntegerConstant
           << Expr->getSourceRange();
     else
       S.Diag(getAttrLoc(AI), diag::err_attribute_argument_type)
-          << AI << AANT_ArgumentIntegerConstant << Expr->getSourceRange();
+          << &AI << AANT_ArgumentIntegerConstant << Expr->getSourceRange();
     return false;
   }
 
@@ -261,7 +261,7 @@
 
   if (StrictlyUnsigned && I.isSigned() && I.isNegative()) {
     S.Diag(getAttrLoc(AI), diag::err_attribute_requires_positive_integer)
-        << AI << /*non-negative*/ 1;
+        << &AI << /*non-negative*/ 1;
     return false;
   }
 
@@ -5853,57 +5853,115 @@
   }
 }
 
-static void handleAMDGPUFlatWorkGroupSizeAttr(Sema &S, Decl *D,
-                                              const ParsedAttr &AL) {
+static bool
+checkAMDGPUFlatWorkGroupSizeArguments(Sema &S, Expr *MinExpr, Expr *MaxExpr,
+                                      const AMDGPUFlatWorkGroupSizeAttr &Attr) {
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (MinExpr->isValueDependent() || MaxExpr->isValueDependent())
+    return false;
+
   uint32_t Min = 0;
-  Expr *MinExpr = AL.getArgAsExpr(0);
-  if (!checkUInt32Argument(S, AL, MinExpr, Min))
-    return;
+  if (!checkUInt32Argument(S, Attr, MinExpr, Min, 0))
+    return true;
 
   uint32_t Max = 0;
-  Expr *MaxExpr = AL.getArgAsExpr(1);
-  if (!checkUInt32Argument(S, AL, MaxExpr, Max))
-    return;
+  if (!checkUInt32Argument(S, Attr, MaxExpr, Max, 1))
+    return true;
 
   if (Min == 0 && Max != 0) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_invalid) << AL << 0;
-    return;
+    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
+        << &Attr << 0;
+    return true;
   }
   if (Min > Max) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_invalid) << AL << 1;
-    return;
+    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
+        << &Attr << 1;
+    return true;
   }
 
-  D->addAttr(::new (S.Context)
-             AMDGPUFlatWorkGroupSizeAttr(AL.getLoc(), S.Context, Min, Max,
-                                         AL.getAttributeSpellingListIndex()));
+  return false;
 }
 
-static void handleAMDGPUWavesPerEUAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
-  uint32_t Min = 0;
-  Expr *MinExpr = AL.getArgAsExpr(0);
-  if (!checkUInt32Argument(S, AL, MinExpr, Min))
+void Sema::addAMDGPUFlatWorkGroupSizeAttr(SourceRange AttrRange, Decl *D,
+                                          Expr *MinExpr, Expr *MaxExpr,
+                                          unsigned SpellingListIndex) {
+  AMDGPUFlatWorkGroupSizeAttr TmpAttr(AttrRange, Context, MinExpr, MaxExpr,
+                                      SpellingListIndex);
+
+  if (checkAMDGPUFlatWorkGroupSizeArguments(*this, MinExpr, MaxExpr, TmpAttr))
     return;
 
+  D->addAttr(::new (Context) AMDGPUFlatWorkGroupSizeAttr(
+      AttrRange, Context, MinExpr, MaxExpr, SpellingListIndex));
+}
+
+static void handleAMDGPUFlatWorkGroupSizeAttr(Sema &S, Decl *D,
+                                              const ParsedAttr &AL) {
+  Expr *MinExpr = AL.getArgAsExpr(0);
+  Expr *MaxExpr = AL.getArgAsExpr(1);
+
+  S.addAMDGPUFlatWorkGroupSizeAttr(AL.getRange(), D, MinExpr, MaxExpr,
+                                   AL.getAttributeSpellingListIndex());
+}
+
+static bool checkAMDGPUWavesPerEUArguments(Sema &S, Expr *MinExpr,
+                                           Expr *MaxExpr,
+                                           const AMDGPUWavesPerEUAttr &Attr) {
+  if (S.DiagnoseUnexpandedParameterPack(MinExpr) ||
+      (MaxExpr && S.DiagnoseUnexpandedParameterPack(MaxExpr)))
+    return true;
+
+  // Accept template arguments for now as they depend on something else.
+  // We'll get to check them when they eventually get instantiated.
+  if (MinExpr->isValueDependent() || (MaxExpr && MaxExpr->isValueDependent()))
+    return false;
+
+  uint32_t Min = 0;
+  if (!checkUInt32Argument(S, Attr, MinExpr, Min, 0))
+    return true;
+
   uint32_t Max = 0;
-  if (AL.getNumArgs() == 2) {
-    Expr *MaxExpr = AL.getArgAsExpr(1);
-    if (!checkUInt32Argument(S, AL, MaxExpr, Max))
-      return;
-  }
+  if (MaxExpr && !checkUInt32Argument(S, Attr, MaxExpr, Max, 1))
+    return true;
 
   if (Min == 0 && Max != 0) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_invalid) << AL << 0;
-    return;
+    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
+        << &Attr << 0;
+    return true;
   }
   if (Max != 0 && Min > Max) {
-    S.Diag(AL.getLoc(), diag::err_attribute_argument_invalid) << AL << 1;
-    return;
+    S.Diag(Attr.getLocation(), diag::err_attribute_argument_invalid)
+        << &Attr << 1;
+    return true;
   }
 
-  D->addAttr(::new (S.Context)
-             AMDGPUWavesPerEUAttr(AL.getLoc(), S.Context, Min, Max,
-                                  AL.getAttributeSpellingListIndex()));
+  return false;
+}
+
+void Sema::addAMDGPUWavesPerEUAttr(SourceRange AttrRange, Decl *D,
+                                   Expr *MinExpr, Expr *MaxExpr,
+                                   unsigned SpellingListIndex) {
+  AMDGPUWavesPerEUAttr TmpAttr(AttrRange, Context, MinExpr, MaxExpr,
+                               SpellingListIndex);
+
+  if (checkAMDGPUWavesPerEUArguments(*this, MinExpr, MaxExpr, TmpAttr))
+    return;
+
+  D->addAttr(::new (Context) AMDGPUWavesPerEUAttr(AttrRange, Context, MinExpr,
+                                                  MaxExpr, SpellingListIndex));
+}
+
+static void handleAMDGPUWavesPerEUAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (!checkAttributeAtLeastNumArgs(S, AL, 1) ||
+      !checkAttributeAtMostNumArgs(S, AL, 2))
+    return;
+
+  Expr *MinExpr = AL.getArgAsExpr(0);
+  Expr *MaxExpr = (AL.getNumArgs() > 1) ? AL.getArgAsExpr(1) : nullptr;
+
+  S.addAMDGPUWavesPerEUAttr(AL.getRange(), D, MinExpr, MaxExpr,
+                            AL.getAttributeSpellingListIndex());
 }
 
 static void handleAMDGPUNumSGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4555,6 +4555,7 @@
 
   EnterExpressionEvaluationContext ConstantEvaluated(
       SemaRef, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+  SemaRef.ExprEvalContexts.back().Template = Template;
   return SemaRef.SubstExpr(Param->getDefaultArgument(), TemplateArgLists);
 }
 
@@ -4805,8 +4806,8 @@
       TemplateArgument Result;
       unsigned CurSFINAEErrors = NumSFINAEErrors;
       ExprResult Res =
-        CheckTemplateArgument(NTTP, NTTPType, Arg.getArgument().getAsExpr(),
-                              Result, CTAK);
+          CheckTemplateArgument(NTTP, NTTPType, Arg.getArgument().getAsExpr(),
+                                Result, CTAK, dyn_cast<TemplateDecl>(Template));
       if (Res.isInvalid())
         return true;
       // If the current template argument causes an error, give up now.
@@ -6175,6 +6176,22 @@
   return true;
 }
 
+namespace {
+FunctionDecl *GetFunctionDecl(Expr *Arg) {
+  Expr *E = Arg;
+  if (UnaryOperator *UO = dyn_cast<UnaryOperator>(E)) {
+    E = UO ? UO->getSubExpr() : nullptr;
+  }
+  if (DeclRefExpr *DRE = dyn_cast_or_null<DeclRefExpr>(E)) {
+    ValueDecl *Entity = DRE ? DRE->getDecl() : nullptr;
+    if (Entity) {
+      if (auto Callee = dyn_cast<FunctionDecl>(Entity))
+        return Callee;
+    }
+  }
+  return nullptr;
+}
+} // namespace
 /// Check a template argument against its corresponding
 /// non-type template parameter.
 ///
@@ -6185,7 +6202,8 @@
 ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param,
                                        QualType ParamType, Expr *Arg,
                                        TemplateArgument &Converted,
-                                       CheckTemplateArgumentKind CTAK) {
+                                       CheckTemplateArgumentKind CTAK,
+                                       TemplateDecl *Template) {
   SourceLocation StartLoc = Arg->getBeginLoc();
 
   // If the parameter type somehow involves auto, deduce the type now.
@@ -6272,6 +6290,7 @@
   // a constant-evaluated context.
   EnterExpressionEvaluationContext ConstantEvaluated(
       *this, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+  ExprEvalContexts.back().Template = Template;
 
   if (getLangOpts().CPlusPlus17) {
     // C++17 [temp.arg.nontype]p1:
@@ -6592,6 +6611,10 @@
         return ExprError();
     }
 
+    if (auto *FD = GetFunctionDecl(Arg))
+      if (getLangOpts().CUDA && !CheckCUDACall(Arg->getBeginLoc(), FD))
+        return ExprError();
+
     if (!ParamType->isMemberPointerType()) {
       if (CheckTemplateArgumentAddressOfObjectOrFunction(*this, Param,
                                                          ParamType,
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -344,6 +344,51 @@
       Attr.getRange());
 }
 
+static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const AMDGPUFlatWorkGroupSizeAttr &Attr, Decl *New) {
+  // Both min and max expression are constant expressions.
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  ExprResult Result = S.SubstExpr(Attr.getMin(), TemplateArgs);
+  if (Result.isInvalid())
+    return;
+  Expr *MinExpr = Result.getAs<Expr>();
+
+  Result = S.SubstExpr(Attr.getMax(), TemplateArgs);
+  if (Result.isInvalid())
+    return;
+  Expr *MaxExpr = Result.getAs<Expr>();
+
+  S.addAMDGPUFlatWorkGroupSizeAttr(Attr.getLocation(), New, MinExpr, MaxExpr,
+                                   Attr.getSpellingListIndex());
+}
+
+static void instantiateDependentAMDGPUWavesPerEUAttr(
+    Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs,
+    const AMDGPUWavesPerEUAttr &Attr, Decl *New) {
+  // Both min and max expression are constant expressions.
+  EnterExpressionEvaluationContext Unevaluated(
+      S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
+
+  ExprResult Result = S.SubstExpr(Attr.getMin(), TemplateArgs);
+  if (Result.isInvalid())
+    return;
+  Expr *MinExpr = Result.getAs<Expr>();
+
+  Expr *MaxExpr = nullptr;
+  if (auto Max = Attr.getMax()) {
+    Result = S.SubstExpr(Max, TemplateArgs);
+    if (Result.isInvalid())
+      return;
+    MaxExpr = Result.getAs<Expr>();
+  }
+
+  S.addAMDGPUWavesPerEUAttr(Attr.getLocation(), New, MinExpr, MaxExpr,
+                            Attr.getSpellingListIndex());
+}
+
 void Sema::InstantiateAttrsForDecl(
     const MultiLevelTemplateArgumentList &TemplateArgs, const Decl *Tmpl,
     Decl *New, LateInstantiatedAttrVec *LateAttrs,
@@ -437,6 +482,18 @@
       continue;
     }
 
+    if (const AMDGPUFlatWorkGroupSizeAttr *AMDGPUFlatWorkGroupSize =
+            dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(TmplAttr)) {
+      instantiateDependentAMDGPUFlatWorkGroupSizeAttr(
+          *this, TemplateArgs, *AMDGPUFlatWorkGroupSize, New);
+    }
+
+    if (const AMDGPUWavesPerEUAttr *AMDGPUFlatWorkGroupSize =
+            dyn_cast<AMDGPUWavesPerEUAttr>(TmplAttr)) {
+      instantiateDependentAMDGPUWavesPerEUAttr(*this, TemplateArgs,
+                                               *AMDGPUFlatWorkGroupSize, New);
+    }
+
     // Existing DLL attribute on the instantiation takes precedence.
     if (TmplAttr->getKind() == attr::DLLExport ||
         TmplAttr->getKind() == attr::DLLImport) {
diff --git a/clang/test/SemaCUDA/amdgpu-attrs.cu b/clang/test/SemaCUDA/amdgpu-attrs.cu
--- a/clang/test/SemaCUDA/amdgpu-attrs.cu
+++ b/clang/test/SemaCUDA/amdgpu-attrs.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
 #include "Inputs/cuda.h"
 
 
@@ -78,3 +78,119 @@
 // expected-error@+2{{attribute 'intel_reqd_sub_group_size' can only be applied to an OpenCL kernel function}}
 __attribute__((intel_reqd_sub_group_size(64)))
 __global__ void intel_reqd_sub_group_size_64() {}
+
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size("32", 64)))
+__global__ void non_int_min_flat_work_group_size_32_64() {}
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(32, "64")))
+__global__ void non_int_max_flat_work_group_size_32_64() {}
+
+int nc_min = 32, nc_max = 64;
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(nc_min, 64)))
+__global__ void non_cint_min_flat_work_group_size_32_64() {}
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(32, nc_max)))
+__global__ void non_cint_max_flat_work_group_size_32_64() {}
+
+const int c_min = 16, c_max = 32;
+__attribute__((amdgpu_flat_work_group_size(c_min * 2, 64)))
+__global__ void cint_min_flat_work_group_size_32_64() {}
+__attribute__((amdgpu_flat_work_group_size(32, c_max * 2)))
+__global__ void cint_max_flat_work_group_size_32_64() {}
+
+// expected-error@+3{{'T' does not refer to a value}}
+// expected-note@+1{{declared here}}
+template<typename T>
+__attribute__((amdgpu_flat_work_group_size(T, 64)))
+__global__ void template_class_min_flat_work_group_size_32_64() {}
+// expected-error@+3{{'T' does not refer to a value}}
+// expected-note@+1{{declared here}}
+template<typename T>
+__attribute__((amdgpu_flat_work_group_size(32, T)))
+__global__ void template_class_max_flat_work_group_size_32_64() {}
+
+template<unsigned a, unsigned b>
+__attribute__((amdgpu_flat_work_group_size(a, b)))
+__global__ void template_flat_work_group_size_32_64() {}
+template __global__ void template_flat_work_group_size_32_64<32, 64>();
+
+template<unsigned a, unsigned b, unsigned c>
+__attribute__((amdgpu_flat_work_group_size(a + b, b + c)))
+__global__ void template_complex_flat_work_group_size_32_64() {}
+template __global__ void template_complex_flat_work_group_size_32_64<16, 16, 48>();
+
+unsigned ipow2(unsigned n) { return n == 0 ? 1 : 2 * ipow2(n - 1); }
+constexpr unsigned ce_ipow2(unsigned n) { return n == 0 ? 1 : 2 * ce_ipow2(n - 1); }
+
+__attribute__((amdgpu_flat_work_group_size(ce_ipow2(5), ce_ipow2(6))))
+__global__ void cexpr_flat_work_group_size_32_64() {}
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(ipow2(5), 64)))
+__global__ void non_cexpr_min_flat_work_group_size_32_64() {}
+// expected-error@+1{{'amdgpu_flat_work_group_size' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(32, ipow2(6))))
+__global__ void non_cexpr_max_flat_work_group_size_32_64() {}
+
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu("2")))
+__global__ void non_int_min_waves_per_eu_2() {}
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(2, "4")))
+__global__ void non_int_max_waves_per_eu_2_4() {}
+
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(nc_min)))
+__global__ void non_cint_min_waves_per_eu_2() {}
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(2, nc_max)))
+__global__ void non_cint_min_waves_per_eu_2_4() {}
+
+__attribute__((amdgpu_waves_per_eu(c_min / 8)))
+__global__ void cint_min_waves_per_eu_2() {}
+__attribute__((amdgpu_waves_per_eu(c_min / 8, c_max / 8)))
+__global__ void cint_min_waves_per_eu_2_4() {}
+
+// expected-error@+3{{'T' does not refer to a value}}
+// expected-note@+1{{declared here}}
+template<typename T>
+__attribute__((amdgpu_waves_per_eu(T)))
+__global__ void cint_min_waves_per_eu_2() {}
+// expected-error@+3{{'T' does not refer to a value}}
+// expected-note@+1{{declared here}}
+template<typename T>
+__attribute__((amdgpu_waves_per_eu(2, T)))
+__global__ void cint_min_waves_per_eu_2_4() {}
+
+template<unsigned a>
+__attribute__((amdgpu_waves_per_eu(a)))
+__global__ void template_waves_per_eu_2() {}
+template __global__ void template_waves_per_eu_2<2>();
+
+template<unsigned a, unsigned b>
+__attribute__((amdgpu_waves_per_eu(a, b)))
+__global__ void template_waves_per_eu_2_4() {}
+template __global__ void template_waves_per_eu_2_4<2, 4>();
+
+template<unsigned a, unsigned b, unsigned c>
+__attribute__((amdgpu_waves_per_eu(a + b, c - b)))
+__global__ void template_complex_waves_per_eu_2_4() {}
+template __global__ void template_complex_waves_per_eu_2_4<1, 1, 5>();
+
+// expected-error@+2{{expression contains unexpanded parameter pack 'Args'}}
+template<unsigned... Args>
+__attribute__((amdgpu_waves_per_eu(Args)))
+__global__ void template_waves_per_eu_2() {}
+template __global__ void template_waves_per_eu_2<2, 4>();
+
+__attribute__((amdgpu_waves_per_eu(ce_ipow2(1))))
+__global__ void cexpr_waves_per_eu_2() {}
+__attribute__((amdgpu_waves_per_eu(ce_ipow2(1), ce_ipow2(2))))
+__global__ void cexpr_waves_per_eu_2_4() {}
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(ipow2(1))))
+__global__ void non_cexpr_waves_per_eu_2() {}
+// expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(2, ipow2(2))))
+__global__ void non_cexpr_waves_per_eu_2_4() {}
diff --git a/clang/test/SemaCUDA/kernel-template-with-func-arg.cu b/clang/test/SemaCUDA/kernel-template-with-func-arg.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCUDA/kernel-template-with-func-arg.cu
@@ -0,0 +1,57 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+struct C {
+  __device__ void devfun() {}
+  void hostfun() {}
+  template<class T> __device__ void devtempfun() {}
+  __device__ __host__ void devhostfun() {}
+};
+
+__device__ void devfun() {}
+__host__ void hostfun() {}
+template<class T> __device__ void devtempfun() {}
+__device__ __host__ void devhostfun() {}
+
+template <void (*devF)()> __global__ void kernel() { devF();}
+template <typename T, void(T::*devF)()> __global__ void kernel2(T *p) { (p->*devF)(); }
+
+template<> __global__ void kernel<devfun>();
+template<> __global__ void kernel<hostfun>(); // expected-error {{no function template matches function template specialization 'kernel'}}
+                                              // expected-note@-5 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+template<> __global__ void kernel<devtempfun<int> >();
+template<> __global__ void kernel<devhostfun>();
+
+template<> __global__ void kernel<&devfun>();
+template<> __global__ void kernel<&hostfun>(); // expected-error {{no function template matches function template specialization 'kernel'}}
+                                               // expected-note@-11 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+template<> __global__ void kernel<&devtempfun<int> >();
+template<> __global__ void kernel<&devhostfun>();
+
+template<> __global__ void kernel2<C, &C::devfun>(C *p);
+template<> __global__ void kernel2<C, &C::hostfun>(C *p); // expected-error {{no function template matches function template specialization 'kernel2'}}
+                                                          // expected-note@-16 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+template<> __global__ void kernel2<C, &C::devtempfun<int> >(C *p);
+template<> __global__ void kernel2<C, &C::devhostfun>(C *p);
+
+void fun() {
+  kernel<&devfun><<<1,1>>>();
+  kernel<&hostfun><<<1,1>>>(); // expected-error {{no matching function for call to 'kernel'}}
+                               // expected-note@-24 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+  kernel<&devtempfun<int> ><<<1,1>>>();
+  kernel<&devhostfun><<<1,1>>>();
+
+  kernel<devfun><<<1,1>>>();
+  kernel<hostfun><<<1,1>>>(); // expected-error {{no matching function for call to 'kernel'}}
+                              // expected-note@-30 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+  kernel<devtempfun<int> ><<<1,1>>>();
+  kernel<devhostfun><<<1,1>>>();
+
+  C a;
+  kernel2<C, &C::devfun><<<1,1>>>(&a);
+  kernel2<C, &C::hostfun><<<1,1>>>(&a); // expected-error {{no matching function for call to 'kernel2'}}
+                                        // expected-note@-36 {{candidate template ignored: invalid explicitly-specified argument for template parameter 'devF'}}
+  kernel2<C, &C::devtempfun<int> ><<<1,1>>>(&a);
+  kernel2<C, &C::devhostfun><<<1,1>>>(&a);
+}
diff --git a/clang/test/SemaOpenCL/amdgpu-attrs.cl b/clang/test/SemaOpenCL/amdgpu-attrs.cl
--- a/clang/test/SemaOpenCL/amdgpu-attrs.cl
+++ b/clang/test/SemaOpenCL/amdgpu-attrs.cl
@@ -27,12 +27,12 @@
 __attribute__((amdgpu_num_sgpr(32))) void func_num_sgpr_32() {} // expected-error {{'amdgpu_num_sgpr' attribute only applies to kernel functions}}
 __attribute__((amdgpu_num_vgpr(64))) void func_num_vgpr_64() {} // expected-error {{'amdgpu_num_vgpr' attribute only applies to kernel functions}}
 
-__attribute__((amdgpu_flat_work_group_size("ABC", "ABC"))) kernel void kernel_flat_work_group_size_ABC_ABC() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires an integer constant}}
-__attribute__((amdgpu_flat_work_group_size(32, "ABC"))) kernel void kernel_flat_work_group_size_32_ABC() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires an integer constant}}
-__attribute__((amdgpu_flat_work_group_size("ABC", 64))) kernel void kernel_flat_work_group_size_ABC_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires an integer constant}}
-__attribute__((amdgpu_waves_per_eu("ABC"))) kernel void kernel_waves_per_eu_ABC() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires an integer constant}}
-__attribute__((amdgpu_waves_per_eu(2, "ABC"))) kernel void kernel_waves_per_eu_2_ABC() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires an integer constant}}
-__attribute__((amdgpu_waves_per_eu("ABC", 4))) kernel void kernel_waves_per_eu_ABC_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires an integer constant}}
+__attribute__((amdgpu_flat_work_group_size("ABC", "ABC"))) kernel void kernel_flat_work_group_size_ABC_ABC() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size(32, "ABC"))) kernel void kernel_flat_work_group_size_32_ABC() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_flat_work_group_size("ABC", 64))) kernel void kernel_flat_work_group_size_ABC_64() {} // expected-error {{'amdgpu_flat_work_group_size' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu("ABC"))) kernel void kernel_waves_per_eu_ABC() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires parameter 0 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu(2, "ABC"))) kernel void kernel_waves_per_eu_2_ABC() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}}
+__attribute__((amdgpu_waves_per_eu("ABC", 4))) kernel void kernel_waves_per_eu_ABC_4() {} // expected-error {{'amdgpu_waves_per_eu' attribute requires parameter 0 to be an integer constant}}
 __attribute__((amdgpu_num_sgpr("ABC"))) kernel void kernel_num_sgpr_ABC() {} // expected-error {{'amdgpu_num_sgpr' attribute requires an integer constant}}
 __attribute__((amdgpu_num_vgpr("ABC"))) kernel void kernel_num_vgpr_ABC() {} // expected-error {{'amdgpu_num_vgpr' attribute requires an integer constant}}
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -398,8 +398,9 @@
     const TargetRegisterClass *OpRC =
         TLI->isTypeLegal(OpVT) ? TLI->getRegClassFor(OpVT) : nullptr;
     const TargetRegisterClass *IIRC =
-        II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
-           : nullptr;
+        II ? TII->getRegClass(*II, IIOpNum, TRI, *MF) : nullptr;
+    assert(!II || IIOpNum < II->getNumOperands() || !IIRC);
+    IIRC = TRI->getAllocatableClass(IIRC);
 
     if (OpRC && IIRC && OpRC != IIRC &&
         TargetRegisterInfo::isVirtualRegister(VReg)) {