Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -2817,6 +2817,13 @@
         Current = SSE;
       } else
         llvm_unreachable("unexpected long double representation!");
+    } else if (k == BuiltinType::Float16 || k == BuiltinType::Half) {
+      // AMD64 does not support operations on _Float16 or __fp16 other than
+      // load and store. For load/store operations, _Float16 and __fp16 are
+      // equivalent to a 16-bit integer, since they have the same size and
+      // alignment. We need this to interoperate with GCC, where a 16-bit
+      // integer is used in place of _Float16 or __fp16.
+      Lo = Integer;
     }
     // FIXME: _Decimal32 and _Decimal64 are SSE.
     // FIXME: _float128 and _Decimal128 are (SSE, SSEUp).
Index: clang/test/CodeGenCUDA/float16.cu
===================================================================
--- /dev/null
+++ clang/test/CodeGenCUDA/float16.cu
@@ -0,0 +1,94 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - -x hip %s \
+// RUN:   -fhip-new-launch-api | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -emit-llvm -o - -x hip %s \
+// RUN:   -fcuda-is-device | FileCheck -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// CHECK: %struct.A = type { i16 }
+struct A {
+  short x;
+};
+
+// CHECK: %struct.B = type { half }
+struct B {
+  _Float16 x;
+};
+
+// CHECK: %struct.C = type { half }
+struct C {
+  __fp16 x;
+};
+
+// Check that a struct containing _Float16 is coerced and passed to the
+// kernel correctly, in the same way as a struct containing a 16-bit integer.
+
+// CHECK: define dso_local void @_Z20__device_stub__kern11A(i16 %x.coerce)
+// CHECK: %x = alloca %struct.A, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x, i32 0, i32 0
+// CHECK: store i16 %x.coerce, i16* %coerce.dive, align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.A* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern11A(i16 %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.A, %struct.A* %x1, i32 0, i32 0
+// DEV: store i16 %x.coerce, i16* %coerce.dive, align 2
+__global__ void kern1(A x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern21B(i16 %x.coerce)
+// CHECK: %x = alloca %struct.B, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.B* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern21B(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.B, %struct.B* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %add = fadd contract half %[[HALF]], 0xH3C00
+// DEV: store half %add, half* %x2, align 2
+__global__ void kern2(B x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z20__device_stub__kern31C(i16 %x.coerce)
+// CHECK: %x = alloca %struct.C, align 2
+// CHECK: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x, i32 0, i32 0
+// CHECK: %[[PTR:.*]] = bitcast half* %coerce.dive to i16*
+// CHECK: store i16 %x.coerce, i16* %[[PTR]], align 2
+// CHECK: %kernel_args = alloca i8*, i64 1, align 16
+// CHECK: %[[PTR:.*]] = bitcast %struct.C* %x to i8*
+// CHECK: %[[ARGS:.*]] = getelementptr i8*, i8** %kernel_args, i32 0
+// CHECK: store i8* %[[PTR]], i8** %[[ARGS]], align 8
+// DEV: define dso_local amdgpu_kernel void @_Z5kern31C(half %x.coerce)
+// DEV: %coerce.dive = getelementptr inbounds %struct.C, %struct.C* %x1, i32 0, i32 0
+// DEV: store half %x.coerce, half* %coerce.dive, align 2
+// DEV: %[[HALF:.*]] = load half, half* %x2, align 2
+// DEV: %conv = fpext half %[[HALF]] to float
+// DEV: %add = fadd contract float %conv, 1.000000e+00
+// DEV: %[[HALF:.*]] = fptrunc float %add to half
+// DEV: store half %[[HALF]], half* %x2, align 2
__global__ void kern3(C x) {
+  x.x += 1;
+}
+
+// CHECK: define dso_local void @_Z4fun11A(i16 %x.coerce)
+void fun1(A x) {
+  kern1<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z4fun21B(i16 %x.coerce)
+void fun2(B x) {
+  kern2<<<1, 1>>>(x);
+}
+
+// CHECK: define dso_local void @_Z5func31C(i16 %x.coerce)
+void func3(C x) {
+  kern3<<<1, 1>>>(x);
+}
Index: clang/test/CodeGenOpenCL/builtins-f16.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-f16.cl
+++ clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -6,66 +6,70 @@
 void test_half_builtins(half h0, half h1, half h2) {
   volatile half res;
 
-  // CHECK: call half @llvm.copysign.f16(half %h0, half %h1)
+  // CHECK: [[h0:%.*]] = bitcast i16 %h0.coerce to half
+  // CHECK: [[h1:%.*]] = bitcast i16 %h1.coerce to half
+  // CHECK: [[h2:%.*]] = bitcast i16 %h2.coerce to half
+
+  // CHECK: call half @llvm.copysign.f16(half [[h0]], half [[h1]])
   res = __builtin_copysignf16(h0, h1);
 
-  // CHECK: call half @llvm.fabs.f16(half %h0)
+  // CHECK: call half @llvm.fabs.f16(half [[h0]])
   res = __builtin_fabsf16(h0);
 
-  // CHECK: call half @llvm.ceil.f16(half %h0)
+  // CHECK: call half @llvm.ceil.f16(half [[h0]])
   res = __builtin_ceilf16(h0);
 
-  // CHECK: call half @llvm.cos.f16(half %h0)
+  // CHECK: call half @llvm.cos.f16(half [[h0]])
   res = __builtin_cosf16(h0);
 
-  // CHECK: call half @llvm.exp.f16(half %h0)
+  // CHECK: call half @llvm.exp.f16(half [[h0]])
   res = __builtin_expf16(h0);
 
-  // CHECK: call half @llvm.exp2.f16(half %h0)
+  // CHECK: call half @llvm.exp2.f16(half [[h0]])
   res = __builtin_exp2f16(h0);
 
-  // CHECK: call half @llvm.floor.f16(half %h0)
+  // CHECK: call half @llvm.floor.f16(half [[h0]])
   res = __builtin_floorf16(h0);
 
-  // CHECK: call half @llvm.fma.f16(half %h0, half %h1, half %h2)
+  // CHECK: call half @llvm.fma.f16(half [[h0]], half [[h1]], half [[h2]])
   res = __builtin_fmaf16(h0, h1 ,h2);
 
-  // CHECK: call half @llvm.maxnum.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.maxnum.f16(half [[h0]], half [[h1]])
   res = __builtin_fmaxf16(h0, h1);
 
-  // CHECK: call half @llvm.minnum.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.minnum.f16(half [[h0]], half [[h1]])
   res = __builtin_fminf16(h0, h1);
 
-  // CHECK: frem half %h0, %h1
+  // CHECK: frem half [[h0]], [[h1]]
   res = __builtin_fmodf16(h0, h1);
 
-  // CHECK: call half @llvm.pow.f16(half %h0, half %h1)
+  // CHECK: call half @llvm.pow.f16(half [[h0]], half [[h1]])
   res = __builtin_powf16(h0, h1);
 
-  // CHECK: call half @llvm.log10.f16(half %h0)
+  // CHECK: call half @llvm.log10.f16(half [[h0]])
   res = __builtin_log10f16(h0);
 
-  // CHECK: call half @llvm.log2.f16(half %h0)
+  // CHECK: call half @llvm.log2.f16(half [[h0]])
   res = __builtin_log2f16(h0);
 
-  // CHECK: call half @llvm.log.f16(half %h0)
+  // CHECK: call half @llvm.log.f16(half [[h0]])
   res = __builtin_logf16(h0);
 
-  // CHECK: call half @llvm.rint.f16(half %h0)
+  // CHECK: call half @llvm.rint.f16(half [[h0]])
   res = __builtin_rintf16(h0);
 
-  // CHECK: call half @llvm.round.f16(half %h0)
+  // CHECK: call half @llvm.round.f16(half [[h0]])
   res = __builtin_roundf16(h0);
 
-  // CHECK: call half @llvm.sin.f16(half %h0)
+  // CHECK: call half @llvm.sin.f16(half [[h0]])
   res = __builtin_sinf16(h0);
 
-  // CHECK: call half @llvm.sqrt.f16(half %h0)
+  // CHECK: call half @llvm.sqrt.f16(half [[h0]])
   res = __builtin_sqrtf16(h0);
 
-  // CHECK: call half @llvm.trunc.f16(half %h0)
+  // CHECK: call half @llvm.trunc.f16(half [[h0]])
   res = __builtin_truncf16(h0);
 
-  // CHECK: call half @llvm.canonicalize.f16(half %h0)
+  // CHECK: call half @llvm.canonicalize.f16(half [[h0]])
   res = __builtin_canonicalizef16(h0);
 }
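
For reference, below is a minimal host-side sketch of what the TargetInfo.cpp change implies; it is illustrative only and not part of the patch. The struct name Box, the function take, the parameter out, and the approximate IR in the comments are assumptions that mirror struct B and the x86_64 RUN line in float16.cu above: with Lo = Integer, a by-value struct wrapping a single _Float16 is expected to be coerced to an i16 argument and passed in a general-purpose register rather than an XMM register, which is the GCC-compatible behavior the patch comment describes.

// Illustrative host-side sketch (not part of the patch); mirrors struct B in
// float16.cu and assumes a host compilation where _Float16 is accepted, e.g.
// the x86_64 HIP host RUN line above.
struct Box {
  _Float16 v; // 16 bits wide, 2-byte aligned -- same layout as a short
};

// Under this patch the lone _Float16 member is classified as Integer, so the
// definition is expected to lower roughly to:
//   define dso_local void @take(i16 %b.coerce, half* %out)
// i.e. the argument travels in a GPR as a 16-bit integer.
extern "C" void take(Box b, _Float16 *out) {
  *out = b.v; // host code only loads/stores the 16-bit value
}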