Index: lib/CodeGen/TargetInfo.cpp
===================================================================
--- lib/CodeGen/TargetInfo.cpp
+++ lib/CodeGen/TargetInfo.cpp
@@ -4762,7 +4762,9 @@
   void SetTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
                            CodeGen::CodeGenModule &M) const override;
 private:
-  static void addKernelMetadata(llvm::Function *F);
+  // Adds a NamedMDNode with F, Name, and Operand as operands, and adds the
+  // resulting MDNode to the nvvm.annotations MDNode.
+  static void addNVVMMetadata(llvm::Function *F, StringRef Name, int Operand);
 };
 
 ABIArgInfo NVPTXABIInfo::classifyReturnType(QualType RetTy) const {
@@ -4821,7 +4823,8 @@
     // By default, all functions are device functions
     if (FD->hasAttr<OpenCLKernelAttr>()) {
       // OpenCL __kernel functions get kernel metadata
-      addKernelMetadata(F);
+      // Create !{<func-ref>, metadata !"kernel", i32 1} node
+      addNVVMMetadata(F, "kernel", 1);
       // And kernel functions are not subject to inlining
       F->addFnAttr(llvm::Attribute::NoInline);
     }
@@ -4832,28 +4835,43 @@
     // CUDA __global__ functions get a kernel metadata entry.  Since
     // __global__ functions cannot be called from the device, we do not
     // need to set the noinline attribute.
-    if (FD->hasAttr<CUDAGlobalAttr>())
-      addKernelMetadata(F);
+    if (FD->hasAttr<CUDAGlobalAttr>()) {
+      // Create !{<func-ref>, metadata !"kernel", i32 1} node
+      addNVVMMetadata(F, "kernel", 1);
+    }
+    if (FD->hasAttr<CUDALaunchBoundsAttr>()) {
+      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
+      addNVVMMetadata(F, "maxntidx",
+                      FD->getAttr<CUDALaunchBoundsAttr>()->getMaxThreads());
+      // min blocks is a default argument for CUDALaunchBoundsAttr, so getting a
+      // zero value from getMinBlocks either means it was not specified in
+      // __launch_bounds__ or the user specified a 0 value. In both cases, we
+      // don't have to add a PTX directive.
+      int MinCTASM = FD->getAttr<CUDALaunchBoundsAttr>()->getMinBlocks();
+      if (MinCTASM > 0) {
+        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
+        addNVVMMetadata(F, "minctasm", MinCTASM);
+      }
+    }
   }
 }
 
-void NVPTXTargetCodeGenInfo::addKernelMetadata(llvm::Function *F) {
+void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::Function *F, StringRef Name,
+                                             int Operand) {
   llvm::Module *M = F->getParent();
   llvm::LLVMContext &Ctx = M->getContext();
 
   // Get "nvvm.annotations" metadata node
   llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
 
-  // Create !{<func-ref>, metadata !"kernel", i32 1} node
   llvm::SmallVector<llvm::Value *, 3> MDVals;
   MDVals.push_back(F);
-  MDVals.push_back(llvm::MDString::get(Ctx, "kernel"));
-  MDVals.push_back(llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1));
-
+  MDVals.push_back(llvm::MDString::get(Ctx, Name));
+  MDVals.push_back(
+      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand));
   // Append metadata to nvvm.annotations
   MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
 }
-
 }
 
 //===----------------------------------------------------------------------===//
Index: test/CodeGenCUDA/launch-bounds.cu
===================================================================
--- /dev/null
+++ test/CodeGenCUDA/launch-bounds.cu
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -emit-llvm -o - | FileCheck %s
+
+#include "../SemaCUDA/cuda.h"
+
+#define MAX_THREADS_PER_BLOCK 256
+#define MIN_BLOCKS_PER_MP     2
+
+// Test both max threads per block and Min cta per sm.
+extern "C" {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP )
+Kernel1()
+{
+}
+}
+
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"maxntidx", i32 256}
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel1, metadata !"minctasm", i32 2}
+
+// Test only max threads per block. Min cta per sm defaults to 0, and
+// CodeGen doesn't output a zero value for minctasm.
+extern "C" {
+__global__ void
+__launch_bounds__( MAX_THREADS_PER_BLOCK )
+Kernel2()
+{
+}
+}
+
+// CHECK: !{{[0-9]+}} = metadata !{void ()* @Kernel2, metadata !"maxntidx", i32 256}