Index: lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- lib/CodeGen/CGCUDABuiltin.cpp +++ lib/CodeGen/CGCUDABuiltin.cpp @@ -102,9 +102,15 @@ // If there are no args, pass a null pointer to vprintf. BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); } else { - BufferPtr = Builder.Insert(new llvm::AllocaInst( + // Insert our alloca not into the current BB, but into the function's entry + // block. This is important because nvvm doesn't support alloca -- if we + // put the alloca anywhere else, llvm may eventually output + // stacksave/stackrestore intrinsics, which cause ptxas to choke. + auto *Alloca = new llvm::AllocaInst( llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize), - BufAlign, "printf_arg_buf")); + BufAlign, "printf_arg_buf"); + CurFn->getEntryBlock().getInstList().push_front(Alloca); + BufferPtr = Alloca; unsigned Offset = 0; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { Index: test/CodeGenCUDA/printf.cu =================================================================== --- test/CodeGenCUDA/printf.cu +++ test/CodeGenCUDA/printf.cu @@ -51,3 +51,14 @@ // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} printf("hello, world!"); } + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +__device__ bool foo(); +__device__ void CheckAllocaIsInEntryBlock() { + // CHECK: alloca i8, i32 4 align 4 + // CHECK: call @_Z3foov() + if (foo()) { + printf("%d", 42); + } +}