Index: lib/CodeGen/CGCUDABuiltin.cpp
===================================================================
--- lib/CodeGen/CGCUDABuiltin.cpp
+++ lib/CodeGen/CGCUDABuiltin.cpp
@@ -102,9 +102,15 @@
     // If there are no args, pass a null pointer to vprintf.
     BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx));
   } else {
-    BufferPtr = Builder.Insert(new llvm::AllocaInst(
+    // Insert our alloca not into the current BB, but into the function's entry
+    // block.  This is important because nvvm doesn't support alloca -- if we
+    // put the alloca anywhere else, llvm may eventually output
+    // stacksave/stackrestore intrinsics, which cause ptxas to choke.
+    auto *Alloca = new llvm::AllocaInst(
         llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize),
-        BufAlign, "printf_arg_buf"));
+        BufAlign, "printf_arg_buf");
+    CurFn->getEntryBlock().getInstList().push_front(Alloca);
+    BufferPtr = Alloca;
 
     unsigned Offset = 0;
     for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) {
Index: test/CodeGenCUDA/printf.cu
===================================================================
--- test/CodeGenCUDA/printf.cu
+++ test/CodeGenCUDA/printf.cu
@@ -51,3 +51,14 @@
   // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
   printf("hello, world!");
 }
+
+// Check that printf's alloca happens in the entry block, not inside the if
+// statement.
+__device__ bool foo();
+__device__ void CheckAllocaIsInEntryBlock() {
+  // CHECK: alloca i8, i32 4 align 4
+  // CHECK: call @_Z3foov()
+  if (foo()) {
+    printf("%d", 42);
+  }
+}