Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,13 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support operations on _Float16 or __fp16 other than
+      // load and store. For load/store operations, _Float16 and __fp16 are
+      // equivalent to a 16-bit integer, since they have the same size and
+      // alignment. We need this to interoperate with GCC, where a 16-bit
+      // integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check that a struct containing _Float16 is coerced and passed to the
+// kernel correctly, in the same way as a struct containing a 16-bit integer.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
+__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
__global__ void kern3(C x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
+void fun1(A x) {
+  kern1<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
+void fun2(B x) {
+  kern2<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
+void func3(C x) {
+  kern3<<<1, 1>>>(x);
+}
Index: clang/test/CodeGenOpenCL/builtins-f16.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-f16.cl
+++ clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -6,66 +6,70 @@
 void test_half_builtins(half h0, half h1, half h2) {
   volatile half res;
 
-  // CHECK: call half @llvm.copysign.f16(half %h0, half %h1)
+  // CHECK: [[h0:%.*]] = bitcast i16 %h0.coerce to half
+  // CHECK: [[h1:%.*]] = bitcast i16 %h1.coerce to half
+  // CHECK: [[h2:%.*]] = bitcast i16 %h2.coerce to half
+
+  // CHECK: call half @llvm.copysign.f16(half [[h0]], half [[h1]])
   res = __builtin_copysignf16(h0, h1);
 
-  // CHECK: call half @llvm.fabs.f16(half %h0)
+  // CHECK: call half @llvm.fabs.f16(half [[h0]])
   res = __builtin_fabsf16(h0);
 
-  // CHECK: call half @llvm.ceil.f16(half %h0)
+  // CHECK: call half @llvm.ceil.f16(half [[h0]])
   res = __builtin_ceilf16(h0);
 
-  // CHECK: call half @llvm.cos.f16(half %h0)
+  // CHECK: call half @llvm.cos.f16(half [[h0]])
   res = __builtin_cosf16(h0);
 
-  // CHECK: call half @llvm.exp.f16(half %h0)
+  // CHECK: call half @llvm.exp.f16(half [[h0]])
   res = __builtin_expf16(h0);
 
-  // CHECK: call half @llvm.exp2.f16(half %h0)
+  // CHECK: call half @llvm.exp2.f16(half [[h0]])
   res = __builtin_exp2f16(h0);
 
-  // CHECK: call half @llvm.floor.f16(half %h0)
+  // CHECK: call half @llvm.floor.f16(half [[h0]])
   res = __builtin_floorf16(h0);
 
-  // CHECK: call half @llvm.fma.f16(half %h0, half %h1, half %h2)
+  // CHECK: call half @llvm.fma.f16(half [[h0]], half [[h1]], half [[h2]])
   res = __builtin_fmaf16(h0, h1 ,h2);
 
-  // CHECK: call half @llvm.maxnum.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.maxnum.f16(half [[h0]], half [[h1]])
   res = __builtin_fmaxf16(h0, h1);
 
-  // CHECK: call half @llvm.minnum.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.minnum.f16(half [[h0]], half [[h1]])
   res = __builtin_fminf16(h0, h1);
 
-  // CHECK: frem half %h0, %h1
+  // CHECK: frem half [[h0]], [[h1]]
   res = __builtin_fmodf16(h0, h1);
 
-  // CHECK: call half @llvm.pow.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.pow.f16(half [[h0]], half [[h1]])
   res = __builtin_powf16(h0, h1);
 
-  // CHECK: call half @llvm.log10.f16(half %h0)
+  // CHECK: call half @llvm.log10.f16(half [[h0]])
   res = __builtin_log10f16(h0);
 
-  // CHECK: call half @llvm.log2.f16(half %h0)
+  // CHECK: call half @llvm.log2.f16(half [[h0]])
   res = __builtin_log2f16(h0);
 
-  // CHECK: call half @llvm.log.f16(half %h0)
+  // CHECK: call half @llvm.log.f16(half [[h0]])
   res = __builtin_logf16(h0);
 
-  // CHECK: call half @llvm.rint.f16(half %h0)
+  // CHECK: call half @llvm.rint.f16(half [[h0]])
   res = __builtin_rintf16(h0);
 
-  // CHECK: call half @llvm.round.f16(half %h0)
+  // CHECK: call half @llvm.round.f16(half [[h0]])
   res = __builtin_roundf16(h0);
 
-  // CHECK: call half @llvm.sin.f16(half %h0)
+  // CHECK: call half @llvm.sin.f16(half [[h0]])
   res = __builtin_sinf16(h0);
 
-  // CHECK: call half @llvm.sqrt.f16(half %h0)
+  // CHECK: call half @llvm.sqrt.f16(half [[h0]])
   res = __builtin_sqrtf16(h0);
 
-  // CHECK: call half @llvm.trunc.f16(half %h0)
+  // CHECK: call half @llvm.trunc.f16(half [[h0]])
   res = __builtin_truncf16(h0);
 
-  // CHECK: call half @llvm.canonicalize.f16(half %h0)
+  // CHECK: call half @llvm.canonicalize.f16(half [[h0]])
   res = __builtin_canonicalizef16(h0);
 }
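
For reference, below is a minimal host-side sketch of what the TargetInfo.cpp change implies; it is illustrative only and not part of the patch. The struct name Box, the function take, the parameter out, and the approximate IR in the comments are assumptions that mirror struct B and the x86_64 RUN line in float16.cu above: with Lo = Integer, a by-value struct wrapping a single _Float16 is expected to be coerced to an i16 argument and passed in a general-purpose register rather than an XMM register, which is the GCC-compatible behavior the patch comment describes.

// Illustrative host-side sketch (not part of the patch); mirrors struct B in
// float16.cu and assumes a host compilation where _Float16 is accepted, e.g.
// the x86_64 HIP host RUN line above.
struct Box {
  _Float16 v; // 16 bits wide, 2-byte aligned -- same layout as a short
};

// Under this patch the lone _Float16 member is classified as Integer, so the
// definition is expected to lower roughly to:
//   define dso_local void @take(i16 %b.coerce, half* %out)
// i.e. the argument travels in a GPR as a 16-bit integer.
extern "C" void take(Box b, _Float16 *out) {
  *out = b.v; // host code only loads/stores the 16-bit value
}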