Index: lib/CodeGen/CGCUDABuiltin.cpp =================================================================== --- lib/CodeGen/CGCUDABuiltin.cpp +++ lib/CodeGen/CGCUDABuiltin.cpp @@ -102,9 +102,15 @@ // If there are no args, pass a null pointer to vprintf. BufferPtr = llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(Ctx)); } else { - BufferPtr = Builder.Insert(new llvm::AllocaInst( + // Insert our alloca not into the current BB, but into the function's entry + // block. This is important because nvvm doesn't support alloca -- if we + // put the alloca anywhere else, llvm may eventually output + // stacksave/stackrestore intrinsics, which cause our nvvm backend to choke. + auto *Alloca = new llvm::AllocaInst( llvm::Type::getInt8Ty(Ctx), llvm::ConstantInt::get(Int32Ty, BufSize), - BufAlign, "printf_arg_buf")); + BufAlign, "printf_arg_buf"); + Alloca->insertAfter(AllocaInsertPt); + BufferPtr = Alloca; unsigned Offset = 0; for (unsigned I = 1, NumArgs = Args.size(); I < NumArgs; ++I) { Index: test/CodeGenCUDA/printf.cu =================================================================== --- test/CodeGenCUDA/printf.cu +++ test/CodeGenCUDA/printf.cu @@ -10,9 +10,9 @@ // Check a simple call to printf end-to-end. __device__ int CheckSimple() { + // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4 // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt const char* fmt = "%d"; - // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca i8, i32 4, align 4 // CHECK: [[PTR:%[0-9]+]] = getelementptr i8, i8* [[BUF]], i32 0 // CHECK: [[CAST:%[0-9]+]] = bitcast i8* [[PTR]] to i32* // CHECK: store i32 42, i32* [[CAST]], align 4 @@ -51,3 +51,14 @@ // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}} printf("hello, world!"); } + +// Check that printf's alloca happens in the entry block, not inside the if +// statement. +__device__ bool foo(); +__device__ void CheckAllocaIsInEntryBlock() { + // CHECK: alloca i8, i32 4, align 4 + // CHECK: call {{.*}} @_Z3foov() + if (foo()) { + printf("%d", 42); + } +}