This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
clang/
-
lib/
-
CodeGen/
-
CGBuiltin.cpp
-
Headers/
1
avx512bf16intrin.h
1/5
avx512vlbf16intrin.h
-
test/CodeGen/X86/
-
CodeGen/
-
X86/
-
avx512bf16-builtins.c
-
avx512vlbf16-builtins.c

Differential D115611

[X86][BF16] delete `typedef unsigned short __bfloat16`
AbandonedPublic

Authored by FreddyYe on Dec 12 2021, 11:44 PM.

Download Raw Diff

Details

Reviewers

pengfei
LuoYuanke
craig.topper
skan

Summary

The name `__bfloat16` may mislead users into thinking its underlying type is
bfloat16, when in fact it is only a typedef for `unsigned short`.

Diff Detail

Repository: rG LLVM Github Monorepo

Unit Tests: Failed

	Time	Test
	60 ms	x64 debian > LLVM.Bindings/Go::go.test
	6,340 ms	x64 debian > libFuzzer.libFuzzer::fork-ubsan.test

Event Timeline

FreddyYe created this revision.Dec 12 2021, 11:44 PM

Herald added a subscriber: pengfei. · View Herald TranscriptDec 12 2021, 11:44 PM

FreddyYe requested review of this revision.Dec 12 2021, 11:44 PM

Herald added a project: Restricted Project. · View Herald TranscriptDec 12 2021, 11:44 PM

Herald added a subscriber: cfe-commits. · View Herald Transcript

FreddyYe added reviewers: pengfei, LuoYuanke, craig.topper, skan.Dec 12 2021, 11:47 PM

Harbormaster completed remote builds in B138908: Diff 393808.Dec 13 2021, 12:14 AM

craig.topper added inline comments.Dec 13 2021, 10:26 AM

clang/lib/Headers/avx512vlbf16intrin.h
416	I'm not sure if this change is a good idea this late. Users could have been dependent on it being an unsigned value before. I believe this changes the behavior of code such as `int result = _mm_cvtness_sbh(X);`. Previously it would have zero extended, but now it will sign extend.

FreddyYe added inline comments.Dec 13 2021, 5:39 PM

clang/lib/Headers/avx512vlbf16intrin.h
416	Yes, this should be a huge concern. Notice that the intrinsics guide update documented these two intrinsics only on 12/7/2021. So maybe we still have a chance to change it? And it's more theoretically correct to do sign extension from a bfloat16 to int32.

pengfei added inline comments.Dec 13 2021, 6:14 PM

clang/lib/Headers/avx512vlbf16intrin.h
416	I think this is the problem that we choose integer to represent BF16. Neither zero extend nor sign extend makes sense to a floating type. But considering the MSB of floating point is sign bit. Sign extend should be better in theory. Maybe it's a good approach to use `__bf16`, but it is supported only by Clang. We can't use it for intrinsics. Anyway, I'm fine with keeping zero extend here. @craig.topper , do you think it's acceptable to you if we just change `__bfloat16` to `short`?

pengfei added inline comments.Dec 13 2021, 6:21 PM

clang/lib/Headers/avx512vlbf16intrin.h
416	Sorry, I mean `unsigned short`

craig.topper added inline comments.Dec 13 2021, 7:48 PM

clang/lib/Headers/avx512vlbf16intrin.h
416	I'm fine with just changing it to `unsigned short`

changed into unsigned short

clang-format

LGTM

This revision is now accepted and ready to land.Dec 13 2021, 9:49 PM

Harbormaster completed remote builds in B139145: Diff 394137.Dec 13 2021, 10:16 PM

I would suggest we drop the change. Sorry for my fickleness :(

clang/lib/Headers/avx512bf16intrin.h
32	Sorry, when I reviewed the doxygen, I found a new problem. If we want to use `unsigned short`, we may need to change here too. But `unsigned short` is not clear to users, since they actually want to convert a bfloat type instead of an integer. On the other hand, the double underscore naming convention is reserved for compiler use and we have already used lots of these terminologies for X86 intrinsics. A very similar one is `__mask16`. So I think using `__bfloat16` here is consistent with the existing types we have defined.

This revision now requires changes to proceed.Dec 13 2021, 10:23 PM

Agree with @pengfei. Sorry for the noise.

Revision Contents

Path

Size

clang/

lib/

CodeGen/

CGBuiltin.cpp

4 lines

Headers/

avx512bf16intrin.h

3 lines

avx512vlbf16intrin.h

2 lines

test/

CodeGen/

X86/

avx512bf16-builtins.c

4 lines

avx512vlbf16-builtins.c

2 lines

Diff 393808

clang/lib/CodeGen/CGBuiltin.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,473 Lines • ▼ Show 20 Lines	static Value *EmitX86CvtF16ToFloatExpr(CodeGenFunction &CGF,
return Res;		return Res;
}		}

// Convert a BF16 to a float.		// Convert a BF16 to a float.
static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF,		static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF,
const CallExpr *E,		const CallExpr *E,
ArrayRef<Value *> Ops) {		ArrayRef<Value *> Ops) {
llvm::Type *Int32Ty = CGF.Builder.getInt32Ty();		llvm::Type *Int32Ty = CGF.Builder.getInt32Ty();
Value *ZeroExt = CGF.Builder.CreateZExt(Ops[0], Int32Ty);		Value *SignExt = CGF.Builder.CreateSExt(Ops[0], Int32Ty);
Value *Shl = CGF.Builder.CreateShl(ZeroExt, 16);		Value *Shl = CGF.Builder.CreateShl(SignExt, 16);
llvm::Type *ResultType = CGF.ConvertType(E->getType());		llvm::Type *ResultType = CGF.ConvertType(E->getType());
Value *BitCast = CGF.Builder.CreateBitCast(Shl, ResultType);		Value *BitCast = CGF.Builder.CreateBitCast(Shl, ResultType);
return BitCast;		return BitCast;
}		}

Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {		Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) {

llvm::Type *Int32Ty = Builder.getInt32Ty();		llvm::Type *Int32Ty = Builder.getInt32Ty();
▲ Show 20 Lines • Show All 6,403 Lines • Show Last 20 Lines

clang/lib/Headers/avx512bf16intrin.h

	Show All 9 Lines
	#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."			#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
	#endif			#endif

	#ifndef __AVX512BF16INTRIN_H			#ifndef __AVX512BF16INTRIN_H
	#define __AVX512BF16INTRIN_H			#define __AVX512BF16INTRIN_H

	typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));			typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
	typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));			typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
	typedef unsigned short __bfloat16;

	#define __DEFAULT_FN_ATTRS512 \			#define __DEFAULT_FN_ATTRS512 \
	__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \			__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
	__min_vector_width__(512)))			__min_vector_width__(512)))
	#define __DEFAULT_FN_ATTRS \			#define __DEFAULT_FN_ATTRS \
	__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))			__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))

	/// Convert One BF16 Data to One Single Float Data.			/// Convert One BF16 Data to One Single Float Data.
	///			///
	/// \headerfile <x86intrin.h>			/// \headerfile <x86intrin.h>
	///			///
	/// This intrinsic does not correspond to a specific instruction.			/// This intrinsic does not correspond to a specific instruction.
	///			///
	/// \param __A			/// \param __A
	/// A bfloat data.			/// A bfloat data.
				pengfeiUnsubmitted Not Done Reply Inline Actions Sorry, when I reviewed the doxygen, I found a new problem. If we want to use `unsigned short`, we may need to change here too. But `unsigned short` is not clear to user since they actually want to convert a bfloat type instead of integer. On the other hand, the double underscore naming conversion is reserved for compiler use and we have already used lots of these terminologies for X86 intrinsics. A much similar one is `__mask16`. So I think using `__bfloat16` here is consistent with the existing types we have defined. pengfei: Sorry, when I reviewed the doxygen, I found a new problem. If we want to use `unsigned short`…
	/// \returns A float data whose sign field and exponent field keep unchanged,			/// \returns A float data whose sign field and exponent field keep unchanged,
	/// and fraction field is extended to 23 bits.			/// and fraction field is extended to 23 bits.
	static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {			static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(short __A) {
	return __builtin_ia32_cvtsbf162ss_32(__A);			return __builtin_ia32_cvtsbf162ss_32(__A);
	}			}

	/// Convert Two Packed Single Data to One Packed BF16 Data.			/// Convert Two Packed Single Data to One Packed BF16 Data.
	///			///
	/// \headerfile <x86intrin.h>			/// \headerfile <x86intrin.h>
	///			///
	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.			/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
	▲ Show 20 Lines • Show All 235 Lines • Show Last 20 Lines

clang/lib/Headers/avx512vlbf16intrin.h

	Show First 20 Lines • Show All 407 Lines • ▼ Show 20 Lines
	/// \headerfile <x86intrin.h>			/// \headerfile <x86intrin.h>
	///			///
	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.			/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
	///			///
	/// \param __A			/// \param __A
	/// A float data.			/// A float data.
	/// \returns A bf16 data whose sign field and exponent field keep unchanged,			/// \returns A bf16 data whose sign field and exponent field keep unchanged,
	/// and fraction field is truncated to 7 bits.			/// and fraction field is truncated to 7 bits.
	static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {			static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
				craig.topperUnsubmitted Not Done Reply Inline Actions I'm not sure if this change is a good idea this late. Users could have been dependent on it being an unsigned value before. I believe this changes the behavior of this code int result = _mm_cvtness_sbh(X) Previously it would have zero extended, but now it will sign extend. craig.topper: I'm not sure if this change is a good idea this late. Users could have been dependent on it…
				FreddyYeAuthorUnsubmitted Done Reply Inline Actions Yes, this should be a huge concern. Notice that intrinsic update has just documented these two intrinsics on 12/7/2021. So maybe we still have change to change it? And it's more theory right to do sign extension from a bfloat16 to int32. FreddyYe: Yes, this should be a huge concern. Notice that intrinsic update has just documented these…
				pengfeiUnsubmitted Not Done Reply Inline Actions I think this is the problem that we choose integer to represent BF16. Neither zero extend nor sign extend makes sense to a floating type. But considering the MSB of floating point is sign bit. Sign extend should be better in theory. Maybe it's a good approach to use `__bf16`, but it is supported only by Clang. We can't use it for intrinsics. Anyway, I'm fine with keeping zero extend here. @craig.topper , do you think it's acceptable to you if we just change `__bfloat16` to `short`? pengfei: I think this is the problem that we choose integer to represent BF16. Neither zero extend nor…
				pengfeiUnsubmitted Not Done Reply Inline Actions Sorry, I mean `unsigned short` pengfei: Sorry, I mean `unsigned short`
				craig.topperUnsubmitted Not Done Reply Inline Actions I'm fine with just changing it to `unsigned short` craig.topper: I'm fine with just changing it to `unsigned short`
	__v4sf __V = {__A, 0, 0, 0};			__v4sf __V = {__A, 0, 0, 0};
	__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(			__v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
	(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);			(__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
	return __R[0];			return __R[0];
	}			}

	/// Convert Packed BF16 Data to Packed float Data.			/// Convert Packed BF16 Data to Packed float Data.
	///			///
	▲ Show 20 Lines • Show All 98 Lines • Show Last 20 Lines

clang/test/CodeGen/X86/avx512bf16-builtins.c

	// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin \			// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin \
	// RUN: -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror \			// RUN: -target-feature +avx512bf16 -emit-llvm -o - -Wall -Werror \
	// RUN: \| FileCheck %s			// RUN: \| FileCheck %s

	#include <immintrin.h>			#include <immintrin.h>

	float test_mm_cvtsbh_ss(__bfloat16 A) {			float test_mm_cvtsbh_ss(short A) {
	// CHECK-LABEL: @test_mm_cvtsbh_ss			// CHECK-LABEL: @test_mm_cvtsbh_ss
	// CHECK: zext i16 %{{.*}} to i32			// CHECK: sext i16 %{{.*}} to i32
	// CHECK: shl i32 %{{.*}}, 16			// CHECK: shl i32 %{{.*}}, 16
	// CHECK: bitcast i32 %{{.*}} to float			// CHECK: bitcast i32 %{{.*}} to float
	// CHECK: ret float %{{.*}}			// CHECK: ret float %{{.*}}
	return _mm_cvtsbh_ss(A);			return _mm_cvtsbh_ss(A);
	}			}

	__m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {			__m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) {
	// CHECK-LABEL: @test_mm512_cvtne2ps_pbh			// CHECK-LABEL: @test_mm512_cvtne2ps_pbh
	▲ Show 20 Lines • Show All 95 Lines • Show Last 20 Lines

clang/test/CodeGen/X86/avx512vlbf16-builtins.c

	Show First 20 Lines • Show All 156 Lines • ▼ Show 20 Lines
	__m256 test_mm256_mask_dpbf16_ps(__m256 D, __m256bh A, __m256bh B, __mmask8 U) {			__m256 test_mm256_mask_dpbf16_ps(__m256 D, __m256bh A, __m256bh B, __mmask8 U) {
	// CHECK-LABEL: @test_mm256_mask_dpbf16_ps			// CHECK-LABEL: @test_mm256_mask_dpbf16_ps
	// CHECK: @llvm.x86.avx512bf16.dpbf16ps.256			// CHECK: @llvm.x86.avx512bf16.dpbf16ps.256
	// CHECK: select <8 x i1> %{{.}}, <8 x float> %{{.}}, <8 x float> %{{.*}}			// CHECK: select <8 x i1> %{{.}}, <8 x float> %{{.}}, <8 x float> %{{.*}}
	// CHECK: ret <8 x float> %{{.*}}			// CHECK: ret <8 x float> %{{.*}}
	return _mm256_mask_dpbf16_ps(D, U, A, B);			return _mm256_mask_dpbf16_ps(D, U, A, B);
	}			}

	__bfloat16 test_mm_cvtness_sbh(float A) {			short test_mm_cvtness_sbh(float A) {
	// CHECK-LABEL: @test_mm_cvtness_sbh			// CHECK-LABEL: @test_mm_cvtness_sbh
	// CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128			// CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128
	// CHECK: ret i16 %{{.*}}			// CHECK: ret i16 %{{.*}}
	return _mm_cvtness_sbh(A);			return _mm_cvtness_sbh(A);
	}			}

	__m128 test_mm_cvtpbh_ps(__m128bh A) {			__m128 test_mm_cvtpbh_ps(__m128bh A) {
	// CHECK-LABEL: @test_mm_cvtpbh_ps			// CHECK-LABEL: @test_mm_cvtpbh_ps
	▲ Show 20 Lines • Show All 55 Lines • Show Last 20 Lines