diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -22,6 +22,7 @@
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/Builtins.h"
+#include "clang/Basic/TargetOptions.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/CodeGen/SwiftCallingConv.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -9237,9 +9238,11 @@
     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
   const auto *ReqdWGS =
       M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
-  const bool IsOpenCLKernel =
-      M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
+  const bool IsOpenCL = M.getLangOpts().OpenCL;
+  const bool IsOpenCLKernel = IsOpenCL && FD->hasAttr<OpenCLKernelAttr>();
   const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
+  const TargetOptions::CodeObjectVersionKind CodeObjectVersion =
+      M.getTarget().getTargetOpts().CodeObjectVersion;
 
   const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
   if (ReqdWGS || FlatWGS) {
@@ -9307,6 +9310,14 @@
     if (NumVGPR != 0)
       F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
   }
+
+  // OpenCL pre-COV_5 does not support hostcall, so mark all OpenCL functions
+  // as "amdgpu-no-hostcall-ptr" unless we are compiling device-libs (COV_None)
+  // which adopt the COV of the module they are linked with.
+  if (IsOpenCL && CodeObjectVersion != TargetOptions::COV_None &&
+      CodeObjectVersion < TargetOptions::COV_5) {
+    F->addFnAttr("amdgpu-no-hostcall-ptr");
+  }
 }
 
 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
--- a/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-attrs.cl
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu tahiti -O0 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-- -target-cpu tahiti -O0 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -mcode-object-version=5 -triple amdgcn-amd-amdhsa -target-cpu tahiti -O0 -emit-llvm -o - %s | FileCheck -check-prefix=V5 %s
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -O0 -emit-llvm -verify -o - %s | FileCheck -check-prefix=X86 %s
 
 __attribute__((amdgpu_flat_work_group_size(0, 0))) // expected-no-diagnostics
@@ -161,33 +162,35 @@
 // CHECK-NOT: "amdgpu-num-sgpr"="0"
 // CHECK-NOT: "amdgpu-num-vgpr"="0"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_64_64]] = {{.*}} "amdgpu-flat-work-group-size"="64,64"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_16_128]] = {{.*}} "amdgpu-flat-work-group-size"="16,128"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_64_64]] = {{.*}} "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-hostcall-ptr"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_16_128]] = {{.*}} "amdgpu-flat-work-group-size"="16,128" "amdgpu-no-hostcall-ptr"
 
-// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"  "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr"  "amdgpu-waves-per-eu"="2"
 
-// CHECK-DAG: attributes [[WAVES_PER_EU_2_4]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="2,4"
-// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-sgpr"="32"
-// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2_4]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-waves-per-eu"="2,4"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-vgpr"="64"
-// CHECK-DAG: attributes [[WAVES_PER_EU_2_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[WAVES_PER_EU_2_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[WAVES_PER_EU_2_4_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2,4"
-// CHECK-DAG: attributes [[WAVES_PER_EU_2_4_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
-// CHECK-DAG: attributes [[NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2_4_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[WAVES_PER_EU_2_4_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2,4"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
 
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
-// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2"
+// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64_WAVES_PER_EU_2_4_NUM_SGPR_32_NUM_VGPR_64]] = {{.*}} "amdgpu-flat-work-group-size"="32,64" "amdgpu-no-hostcall-ptr" "amdgpu-num-sgpr"="32" "amdgpu-num-vgpr"="64" "amdgpu-waves-per-eu"="2,4"
 
 // CHECK-DAG: attributes [[A_FUNCTION]] = {{.*}}
-// CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256"
+// CHECK-DAG: attributes [[DEFAULT_KERNEL_ATTRS]] = {{.*}} "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-hostcall-ptr"
+
+// V5-NOT: "amdgpu-no-hostcall-ptr"
diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+
+int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
+
+// CHECK-LABEL: @test_printf_noargs(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* noundef getelementptr inbounds ([1 x i8], [1 x i8] addrspace(4)* @.str, i64 0, i64 0)) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    ret void
+//
+__kernel void test_printf_noargs() {
+    printf("");
+}
+
+// CHECK-LABEL: @test_printf_int(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    store i32 [[I:%.*]], i32 addrspace(5)* [[I_ADDR]], align 4, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32 addrspace(5)* [[I_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* noundef getelementptr inbounds ([3 x i8], [3 x i8] addrspace(4)* @.str.1, i64 0, i64 0), i32 noundef [[TMP0]]) #[[ATTR4]]
+// CHECK-NEXT:    ret void
+//
+__kernel void test_printf_int(int i) {
+    printf("%d", i);
+}
+
+// CHECK-LABEL: @test_printf_str_int(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[I_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT:    [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
+// CHECK-NEXT:    store i32 [[I:%.*]], i32 addrspace(5)* [[I_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast [4 x i8] addrspace(5)* [[S]] to i8 addrspace(5)*
+// CHECK-NEXT:    call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* [[TMP0]]) #[[ATTR5:[0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast [4 x i8] addrspace(5)* [[S]] to i8 addrspace(5)*
+// CHECK-NEXT:    call void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* align 1 [[TMP1]], i8 addrspace(4)* align 1 getelementptr inbounds ([4 x i8], [4 x i8] addrspace(4)* @__const.test_printf_str_int.s, i32 0, i32 0), i64 4, i1 false)
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], [4 x i8] addrspace(5)* [[S]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32 addrspace(5)* [[I_ADDR]], align 4, !tbaa [[TBAA8]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* noundef getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @.str.2, i64 0, i64 0), i8 addrspace(5)* noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast [4 x i8] addrspace(5)* [[S]] to i8 addrspace(5)*
+// CHECK-NEXT:    call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* [[TMP3]]) #[[ATTR5]]
+// CHECK-NEXT:    ret void
+//
+__kernel void test_printf_str_int(int i) {
+    char s[] = "foo";
+    printf("%s:%d", s, i);
+}
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -2826,12 +2826,16 @@
                                                 "HiddenPrintfBuffer"
                                                   A global address space pointer
                                                   to the runtime printf buffer
-                                                  is passed in kernarg.
+                                                  is passed in kernarg. Mutually
+                                                  exclusive with
+                                                  "HiddenHostcallBuffer".
 
                                                 "HiddenHostcallBuffer"
                                                   A global address space pointer
                                                   to the runtime hostcall buffer
-                                                  is passed in kernarg.
+                                                  is passed in kernarg. Mutually
+                                                  exclusive with
+                                                  "HiddenPrintfBuffer".
 
                                                 "HiddenDefaultQueue"
                                                   A global address space pointer
@@ -3351,12 +3355,18 @@
                                                      "hidden_printf_buffer"
                                                        A global address space pointer
                                                        to the runtime printf buffer
-                                                       is passed in kernarg.
+                                                       is passed in kernarg. Mutually
+                                                       exclusive with
+                                                       "hidden_hostcall_buffer"
+                                                       before Code Object V5.
 
                                                      "hidden_hostcall_buffer"
                                                        A global address space pointer
                                                        to the runtime hostcall buffer
-                                                       is passed in kernarg.
+                                                       is passed in kernarg. Mutually
+                                                       exclusive with
+                                                       "hidden_printf_buffer"
+                                                       before Code Object V5.
 
                                                      "hidden_default_queue"
                                                        A global address space pointer
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -401,17 +401,14 @@
   auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
                                       AMDGPUAS::GLOBAL_ADDRESS);
 
-  // Emit "printf buffer" argument if printf is used, otherwise emit dummy
-  // "none" argument.
+  // Emit "printf buffer" if printf is used, else "hostcall buffer" unless the
+  // function has "amdgpu-no-hostcall-ptr", else a dummy "none" argument.
   if (HiddenArgNumBytes >= 32) {
     if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
       emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer);
-    else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr")) {
-      // The printf runtime binding pass should have ensured that hostcall and
-      // printf are not used in the same module.
-      assert(!Func.getParent()->getNamedMetadata("llvm.printf.fmts"));
+    else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
       emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer);
-    } else
+    else
       emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
   }
 
@@ -826,13 +823,10 @@
     if (M->getNamedMetadata("llvm.printf.fmts"))
       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
                     Args);
-    else if (MFI.hasHostcallPtr()) {
-      // The printf runtime binding pass should have ensured that hostcall and
-      // printf are not used in the same module.
-      assert(!M->getNamedMetadata("llvm.printf.fmts"));
+    else if (MFI.hasHostcallPtr())
       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_hostcall_buffer", Offset,
                     Args);
-    } else
+    else
       emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_none", Offset, Args);
   }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -563,15 +563,6 @@
   if (Printfs.empty())
     return false;
 
-  if (auto HostcallFunction = M.getFunction("__ockl_hostcall_internal")) {
-    for (auto &U : HostcallFunction->uses()) {
-      if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
-        M.getContext().emitError(
-            CI, "Cannot use both printf and hostcall in the same module");
-      }
-    }
-  }
-
   TD = &M.getDataLayout();
 
   return lowerPrintfForGpu(M);
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ir-full-v3.ll
@@ -1894,9 +1894,9 @@
 ; CHECK-NEXT: - 1
 ; CHECK-NEXT: - 0
 
-attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" }
-attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
-attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "calls-enqueue-kernel" }
+attributes #0 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "amdgpu-no-hostcall-ptr" }
+attributes #1 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "amdgpu-no-hostcall-ptr" "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
+attributes #2 = { optnone noinline "amdgpu-implicitarg-num-bytes"="56" "amdgpu-no-hostcall-ptr" "calls-enqueue-kernel" }
 
 !llvm.printf.fmts = !{!100, !101}
 
diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-and-hostcall.ll
@@ -0,0 +1,19 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-printf-runtime-binding < %s 2>&1 | FileCheck %s
+
+@.str = private unnamed_addr addrspace(4) constant [6 x i8] c"%s:%d\00", align 1
+
+define amdgpu_kernel void @test_kernel(i32 %n) {
+entry:
+  %str = alloca [9 x i8], align 1, addrspace(5)
+  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
+  %call1 = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @.str, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n)
+  %call2 = call <2 x i64> (i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(i8* undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
+  ret void
+}
+
+declare i32 @printf(i8 addrspace(4)*, ...)
+
+declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
+
+; CHECK-NOT: error:
+; CHECK-NOT: warning:
diff --git a/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll b/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/opencl-printf-no-hostcall.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: not opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-printf-runtime-binding < %s 2>&1 | FileCheck %s
-
-@.str = private unnamed_addr addrspace(2) constant [6 x i8] c"%s:%d\00", align 1
-
-define amdgpu_kernel void @test_kernel(i32 %n) {
-entry:
-  %str = alloca [9 x i8], align 1
-  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8]* %str, i32 0, i32 0
-  %call1 = call i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str, i32 0, i32 0), i8* %arraydecay, i32 %n)
-  %call2 = call <2 x i64> (i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64) @__ockl_hostcall_internal(i8* undef, i32 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9)
-  ret void
-}
-
-declare i32 @printf(i8 addrspace(2)*, ...)
-
-declare <2 x i64> @__ockl_hostcall_internal(i8*, i32, i64, i64, i64, i64, i64, i64, i64, i64)
-
-; CHECK: error: Cannot use both printf and hostcall in the same module