Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp
@@ -1963,6 +1963,12 @@
       return RValue::get(llvm::ConstantExpr::getBitCast(GV, CGM.Int8PtrTy));
     break;
   }
+  case Builtin::BIprintf:
+    // In CUDA device compilations, printf is lowered to a vprintf call.  In
+    // every other mode, leave the switch and let the code below handle it.
+    if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice)
+      return EmitCUDADevicePrintfCallExpr(E, ReturnValue);
+    break;
   }
 
   // If this is an alias for a lib function (e.g. __builtin_sin), emit
Index: cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp
+++ cfe/trunk/lib/CodeGen/CGCUDABuiltin.cpp
@@ -0,0 +1,150 @@
+//===----- CGCUDABuiltin.cpp - Codegen for CUDA builtins ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Generates code for built-in CUDA calls which are not runtime-specific.
+// (Runtime-specific codegen lives in CGCUDARuntime.)
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+
+using namespace clang;
+using namespace CodeGen;
+
+// Returns the module's declaration of the NVPTX vprintf function,
+//   i32 vprintf(i8* format, i8* argbuf)
+// creating an external declaration in M if none exists yet.
+static llvm::Function *GetVprintfDeclaration(llvm::Module &M) {
+  llvm::Type *ArgTypes[] = {llvm::Type::getInt8PtrTy(M.getContext()),
+                            llvm::Type::getInt8PtrTy(M.getContext())};
+  llvm::FunctionType *VprintfFuncType = llvm::FunctionType::get(
+      llvm::Type::getInt32Ty(M.getContext()), ArgTypes, false);
+
+  if (auto *F = M.getFunction("vprintf")) {
+    // Our CUDA system header declares vprintf with the right signature, so
+    // nobody else should have been able to declare vprintf with a bogus
+    // signature.
+    assert(F->getFunctionType() == VprintfFuncType);
+    return F;
+  }
+
+  // vprintf doesn't already exist; create a declaration and insert it into the
+  // module.
+  return llvm::Function::Create(
+      VprintfFuncType, llvm::GlobalVariable::ExternalLinkage, "vprintf", &M);
+}
+
+// Transforms a call to printf into a call to the NVPTX vprintf syscall (which
+// isn't particularly special; it's invoked just like a regular function).
+// vprintf takes two args: A format string, and a pointer to a buffer containing
+// the varargs.
+//
+// For example, the call
+//
+//   printf("format string", arg1, arg2, arg3);
+//
+// is converted into something resembling
+//
+//   char* buf = alloca(...);
+//   *reinterpret_cast<Arg1*>(buf) = arg1;
+//   *reinterpret_cast<Arg2*>(buf + ...) = arg2;
+//   *reinterpret_cast<Arg3*>(buf + ...) = arg3;
+//   vprintf("format string", buf);
+//
+// buf is aligned to the max of {alignof(Arg1), ...}.  Furthermore, each of the
+// args is itself aligned to its preferred alignment.
+//
+// Note that by the time this function runs, E's args have already undergone the
+// standard C vararg promotion (short -> int, float -> double, etc.).
+//
+// Only scalar varargs can be marshalled this way.  If any vararg is not a
+// scalar (e.g. an aggregate passed by value), an "unsupported" error is
+// reported and a constant 0 is returned in place of the call.
+RValue
+CodeGenFunction::EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                              ReturnValueSlot ReturnValue) {
+  assert(getLangOpts().CUDA);
+  assert(getLangOpts().CUDAIsDevice);
+  assert(E->getBuiltinCallee() == Builtin::BIprintf);
+  assert(E->getNumArgs() >= 1); // printf always has at least one arg.
+
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  CallArgList Args;
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // Everything below calls getScalarVal() on the args, which is only valid for
+  // scalar RValues.  Diagnose non-scalar varargs instead of crashing.
+  if (std::any_of(Args.begin() + 1, Args.end(),
+                  [](const CallArg &A) { return !A.RV.isScalar(); })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg to printf");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+
+  // Figure out how large of a buffer we need to hold our varargs and how
+  // aligned the buffer needs to be.  We start iterating at Arg[1], because
+  // that's our first vararg.
+  unsigned BufSize = 0;
+  unsigned BufAlign = 0;
+  for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+    const RValue &RV = Args[I].RV;
+    llvm::Type *Ty = RV.getScalarVal()->getType();
+
+    auto Align = DL.getPrefTypeAlignment(Ty);
+    BufAlign = std::max(BufAlign, Align);
+    // Add padding required to keep the current arg aligned.
+    BufSize = llvm::alignTo(BufSize, Align);
+    BufSize += DL.getTypeAllocSize(Ty);
+  }
+
+  // Construct and fill the buffer.
+  llvm::Value *BufferPtr = nullptr;
+  if (BufSize == 0) {
+    // If there are no args, pass a null pointer to vprintf.
+    BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
+  } else {
+    // NOTE(review): the alloca is inserted at the current insertion point
+    // rather than in the entry block, so a printf inside a loop allocates a
+    // new buffer on every iteration.
+    BufferPtr = Builder.Insert(new llvm::AllocaInst(
+        llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize),
+        BufAlign, "printf_arg_buf"));
+
+    unsigned Offset = 0;
+    for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
+      llvm::Value *Arg = Args[I].RV.getScalarVal();
+      llvm::Type *Ty = Arg->getType();
+      auto Align = DL.getPrefTypeAlignment(Ty);
+
+      // Pad the buffer to Arg's alignment.
+      Offset = llvm::alignTo(Offset, Align);
+
+      // Store Arg into the buffer at Offset.
+      llvm::Value *GEP =
+          Builder.CreateGEP(BufferPtr, llvm::ConstantInt::get(Int32Ty, Offset));
+      llvm::Value *Cast = Builder.CreateBitCast(GEP, Ty->getPointerTo());
+      Builder.CreateAlignedStore(Arg, Cast, Align);
+      Offset += DL.getTypeAllocSize(Ty);
+    }
+  }
+
+  // Invoke vprintf and return.
+  llvm::Function *VprintfFunc = GetVprintfDeclaration(CGM.getModule());
+  return RValue::get(
+      Builder.CreateCall(VprintfFunc, {Args[0].RV.getScalarVal(), BufferPtr}));
+}
Index: cfe/trunk/lib/CodeGen/CMakeLists.txt
===================================================================
--- cfe/trunk/lib/CodeGen/CMakeLists.txt
+++ cfe/trunk/lib/CodeGen/CMakeLists.txt
@@ -32,6 +32,7 @@
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGCUDABuiltin.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
Index: cfe/trunk/lib/CodeGen/CodeGenFunction.h
===================================================================
--- cfe/trunk/lib/CodeGen/CodeGenFunction.h
+++ cfe/trunk/lib/CodeGen/CodeGenFunction.h
@@ -2711,6 +2711,8 @@
   RValue EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
                                 ReturnValueSlot ReturnValue);
 
+  RValue EmitCUDADevicePrintfCallExpr(const CallExpr *E,
+                                      ReturnValueSlot ReturnValue);
 
   RValue EmitBuiltinExpr(const FunctionDecl *FD,
                          unsigned BuiltinID, const CallExpr *E,
Index: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
===================================================================
--- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
+++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h
@@ -210,6 +210,11 @@
 static __device__ __attribute__((used)) int __nvvm_reflect_anchor() {
   return __nvvm_reflect("NONE");
 }
+
+// The nvptx vprintf syscall.  This doesn't actually need to be declared, but we
+// declare it so that if someone else declares it with a different signature,
+// we'll throw an error.
+extern "C" __device__ int vprintf(const char*, const char*);
 #endif
 
 #endif // __CUDA__
Index: cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
===================================================================
--- cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
+++ cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h
@@ -18,3 +18,5 @@
 
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                       cudaStream_t stream = 0);
+
+extern "C" __device__ int printf(const char*, ...);
Index: cfe/trunk/test/CodeGenCUDA/printf.cu
===================================================================
--- cfe/trunk/test/CodeGenCUDA/printf.cu
+++ cfe/trunk/test/CodeGenCUDA/printf.cu
@@ -0,0 +1,53 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C" __device__ int vprintf(const char*, const char*);
+
+// Check a simple call to printf end-to-end.
+__device__ int CheckSimple() {
+  // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
+  const char* fmt = "%d";
+  // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4
+  // CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0
+  // CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32*
+  // CHECK: store i32 42, i32* [[CAST]], align 4
+  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF]])
+  // CHECK: ret i32 [[RET]]
+  return printf(fmt, 42);
+}
+
+// Check that the args' types are promoted correctly when we call printf.
+__device__ void CheckTypes() {
+  // CHECK: alloca {{.*}} align 8
+  // CHECK: getelementptr {{.*}} i32 0
+  // CHECK: bitcast {{.*}} to i32*
+  // CHECK: getelementptr {{.*}} i32 4
+  // CHECK: bitcast {{.*}} to i32*
+  // CHECK: getelementptr {{.*}} i32 8
+  // CHECK: bitcast {{.*}} to double*
+  // CHECK: getelementptr {{.*}} i32 16
+  // CHECK: bitcast {{.*}} to double*
+  printf("%d %d %f %f", (char)1, (short)2, 3.0f, 4.0);
+}
+
+// Check that the args are aligned properly in the buffer.
+__device__ void CheckAlign() {
+  // CHECK: alloca i8, i32 40, align 8
+  // CHECK: getelementptr {{.*}} i32 0
+  // CHECK: getelementptr {{.*}} i32 8
+  // CHECK: getelementptr {{.*}} i32 16
+  // CHECK: getelementptr {{.*}} i32 20
+  // CHECK: getelementptr {{.*}} i32 24
+  // CHECK: getelementptr {{.*}} i32 32
+  printf("%d %f %d %d %d %lld", 1, 2.0, 3, 4, 5, (long long)6);
+}
+
+__device__ void CheckNoArgs() {
+  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
+  printf("hello, world!");
+}