Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -188,6 +188,10 @@
 ModulePass *createR600OpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
+ModulePass *createAMDGPULowerKernelCallsPass();
+void initializeAMDGPULowerKernelCallsPass(PassRegistry&);
+extern char &AMDGPULowerKernelCallsID;
+
 ModulePass *createAMDGPUPrintfRuntimeBinding();
 void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
 extern char &AMDGPUPrintfRuntimeBindingID;
Index: lib/Target/AMDGPU/AMDGPULowerKernelCalls.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULowerKernelCalls.cpp
@@ -0,0 +1,112 @@
+//===-- AMDGPULowerKernelCalls.cpp - Fix kernel calls ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+//
+// \brief Replace calls to OpenCL kernels with calls to equivalent non-kernel
+//        functions.
+//
+// In OpenCL, a kernel may call another kernel as if it were a non-kernel
+// function. However, kernels and functions have different ABIs. To fix this,
+// when we encounter a call to kernel A, we copy the body of A into a new
+// non-kernel function fA. All calls to A are then redirected to fA.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+namespace {
+/// Legacy module pass that retargets calls to amdgpu_kernel functions at
+/// non-kernel copies of those kernels.
+class AMDGPULowerKernelCalls : public ModulePass {
+public:
+  static char ID;
+  explicit AMDGPULowerKernelCalls();
+
+  StringRef getPassName() const override { return "AMDGPU Lower Kernel Calls"; }
+
+private:
+  bool runOnModule(Module &M) override;
+};
+} // end anonymous namespace
+
+char AMDGPULowerKernelCalls::ID = 0;
+
+namespace llvm {
+/// Returns a newly allocated instance of the pass that lowers calls to
+/// kernels (see the file header for details).
+ModulePass *createAMDGPULowerKernelCallsPass() {
+  return new AMDGPULowerKernelCalls();
+}
+} // end namespace llvm
+
+char &llvm::AMDGPULowerKernelCallsID = AMDGPULowerKernelCalls::ID;
+
+INITIALIZE_PASS(
+    AMDGPULowerKernelCalls, "amdgpu-lower-kernel-calls",
+    "Lower calls to kernel functions into non-kernel function calls.", false,
+    false) // Trailing flags: not CFG-only, not an analysis pass.
+// Initializes the pass in PassRegistry::getPassRegistry() on construction.
+AMDGPULowerKernelCalls::AMDGPULowerKernelCalls() : ModulePass(ID) {
+  initializeAMDGPULowerKernelCallsPass(*PassRegistry::getPassRegistry());
+}
+
+/// Renames \p FBody to "__amdgpu_<name>_kernel_body", where <name> is the
+/// current name of \p FKernel.
+static void setNameForBody(Function *FBody, const Function &FKernel) {
+  SmallString<128> BodyName("__amdgpu_");
+  BodyName += FKernel.getName();
+  BodyName += "_kernel_body";
+  FBody->setName(BodyName.str());
+}
+
+/// Returns a non-kernel (C calling convention) copy of kernel \p F.
+static Function *cloneKernel(Function &F) {
+  ValueToValueMapTy VMap; // Required by CloneFunction; not inspected here.
+  Function *Body = nullptr;
+  if (F.empty()) // No body to clone: emit a declaration of the same type.
+    Body = Function::Create(F.getFunctionType(), Function::ExternalLinkage, "",
+                            F.getParent());
+  else
+    Body = CloneFunction(&F, VMap);
+  Body->setCallingConv(CallingConv::C);
+  if (!Body->isDeclaration()) { // No external references exist to a clone.
+    Body->setVisibility(GlobalValue::DefaultVisibility);
+    Body->setLinkage(GlobalValue::InternalLinkage);
+  }
+  setNameForBody(Body, F);
+  return Body;
+}
+
+/// Retargets direct kernel calls to non-kernel clones; true if \p M changed.
+bool AMDGPULowerKernelCalls::runOnModule(Module &M) {
+  bool Changed = false;
+  for (Function &F : M) {
+    if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+      continue;
+    Function *FBody = nullptr; // Clone of F, created on the first call found.
+    for (auto UI = F.use_begin(), UE = F.use_end(); UI != UE;) {
+      Use &U = *UI++; // Advance first: retargeting unlinks U from F's uses.
+      auto *CI = dyn_cast<CallInst>(U.getUser());
+      if (!CI || !CI->isCallee(&U)) // Not a direct call to F.
+        continue;
+      if (!FBody)
+        FBody = cloneKernel(F);
+      CI->setCalledFunction(FBody);
+      CI->setCallingConv(CallingConv::C);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -239,6 +239,7 @@
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
   initializeAMDGPUPrintfRuntimeBindingPass(*PR);
+  initializeAMDGPULowerKernelCallsPass(*PR);
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
 }
@@ -667,6 +668,8 @@
   // bitcast calls.
   addPass(createAMDGPUFixFunctionBitcastsPass());
 
+  // This pass should be run on the linked module.
+  addPass(createAMDGPULowerKernelCallsPass());
   // A call to propagate attributes pass in the backend in case opt was not run.
   addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
 
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -51,6 +51,7 @@
   AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
+  AMDGPULowerKernelCalls.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
Index: test/CodeGen/AMDGPU/call-to-kernel-undefined.ll
===================================================================
--- test/CodeGen/AMDGPU/call-to-kernel-undefined.ll
+++ test/CodeGen/AMDGPU/call-to-kernel-undefined.ll
@@ -1,16 +1,21 @@
-; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
 
-; FIXME: It should be invalid IR to have a call to a kernel, but this
-; is currently relied on, but should be eliminated before codegen.
+; GCN: callee_kernel:
+; GCN: s_endpgm
+; GCN: __amdgpu_callee_kernel_kernel_body
+; GCN: s_setpc_b64
 define amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out) #0 {
 entry:
   store volatile i32 0, i32 addrspace(1)* %out
   ret void
 }
 
-; Make sure there's no crash when the callsite calling convention
-; doesn't match.
-; CHECK: LLVM ERROR: invalid call to entry function
+; GCN: caller_kernel:
+; GCN: s_getpc_b64 s{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]]
+; GCN: s_add_u32 s[[LO2:[0-9]+]], s[[LO1]], __amdgpu_callee_kernel_kernel_body@rel32@lo+4
+; GCN: s_addc_u32 s[[HI2:[0-9]+]], s[[HI1]], __amdgpu_callee_kernel_kernel_body@rel32@hi+4
+; GCN: s_swappc_b64 s[{{[0-9:]+}}], s{{\[}}[[LO2]]:[[HI2]]]
+; GCN: s_endpgm
 define amdgpu_kernel void @caller_kernel(i32 addrspace(1)* %out) #0 {
 entry:
   call void @callee_kernel(i32 addrspace(1)* %out)
Index: test/CodeGen/AMDGPU/call-to-kernel.ll
===================================================================
--- test/CodeGen/AMDGPU/call-to-kernel.ll
+++ test/CodeGen/AMDGPU/call-to-kernel.ll
@@ -1,14 +1,21 @@
-; RUN: not llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
 
-; FIXME: It should be invalid IR to have a call to a kernel, but this
-; is currently relied on, but should be eliminated before codegen.
+; GCN: callee_kernel:
+; GCN: s_endpgm
+; GCN: __amdgpu_callee_kernel_kernel_body
+; GCN: s_setpc_b64
 define amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out) #0 {
 entry:
   store volatile i32 0, i32 addrspace(1)* %out
   ret void
 }
 
-; CHECK: LLVM ERROR: Unsupported calling convention for call
+; GCN: caller_kernel:
+; GCN: s_getpc_b64 s{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]]
+; GCN: s_add_u32 s[[LO2:[0-9]+]], s[[LO1]], __amdgpu_callee_kernel_kernel_body@rel32@lo+4
+; GCN: s_addc_u32 s[[HI2:[0-9]+]], s[[HI1]], __amdgpu_callee_kernel_kernel_body@rel32@hi+4
+; GCN: s_swappc_b64 s[{{[0-9:]+}}], s{{\[}}[[LO2]]:[[HI2]]]
+; GCN: s_endpgm
 define amdgpu_kernel void @caller_kernel(i32 addrspace(1)* %out) #0 {
 entry:
   call amdgpu_kernel void @callee_kernel(i32 addrspace(1)* %out)