Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,12 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support _Float16 or __fp16. When used in
+      // languages supporting it, _Float16 or __fp16 is for storage only,
+      // which is equivalent to 16 bit integer. We need this to interop with
+      // gcc where 16 bit integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check struct containing _Float16 is coerced and passed correctly to kernel
+// in a similar way as int16.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
+__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
__global__ void kern3(C x) {
  x.x += 1;
}

// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
void fun1(A x) {
  kern1<<<1, 1>>>(x);
}

// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
void fun2(B x) {
  kern2<<<1, 1>>>(x);
}

// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
void func3(C x) {
  kern3<<<1, 1>>>(x);
}