Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,12 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support _Float16 or __fp16. When used in
+      // languages supporting it, _Float16 or __fp16 is for storage only,
+      // which is equivalent to 16 bit integer. We need this to interop with
+      // gcc where 16 bit integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check struct containing _Float16 is coerced and passed correctly to kernel
+// in a similar way as int16.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
+__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
__global__ void kern3(C x) {
  x.x += 1;
}

// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
void fun1(A x) {
  kern1<<<1, 1>>>(x);
}

// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
void fun2(B x) {
  kern2<<<1, 1>>>(x);
}

// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
void func3(C x) {
  kern3<<<1, 1>>>(x);
}