diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1749,16 +1749,16 @@ TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256, "V32cV32cV32c", "ncV:256:", "avx512vbmi,avx512vl") // bf16 intrinsics -TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_128, "V8sV4fV4f", "ncV:128:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_256, "V16sV8fV8f", "ncV:256:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_512, "V32sV16fV16f", "ncV:512:", "avx512bf16") -TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_128_mask, "V8sV4fV8sUc", "ncV:128:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_256_mask, "V8sV8fV8sUc", "ncV:256:", "avx512bf16,avx512vl") -TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_512_mask, "V16sV16fV16sUs", "ncV:512:", "avx512bf16") +TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_128, "V8yV4fV4f", "ncV:128:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_256, "V16yV8fV8f", "ncV:256:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cvtne2ps2bf16_512, "V32yV16fV16f", "ncV:512:", "avx512bf16") +TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_128_mask, "V8yV4fV8yUc", "ncV:128:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_256_mask, "V8yV8fV8yUc", "ncV:256:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cvtneps2bf16_512_mask, "V16yV16fV16yUs", "ncV:512:", "avx512bf16") TARGET_BUILTIN(__builtin_ia32_dpbf16ps_128, "V4fV4fV4iV4i", "ncV:128:", "avx512bf16,avx512vl") TARGET_BUILTIN(__builtin_ia32_dpbf16ps_256, "V8fV8fV8iV8i", "ncV:256:", "avx512bf16,avx512vl") TARGET_BUILTIN(__builtin_ia32_dpbf16ps_512, "V16fV16fV16iV16i", "ncV:512:", "avx512bf16") -TARGET_BUILTIN(__builtin_ia32_cvtsbf162ss_32, "fUs", "nc", "avx512bf16") +TARGET_BUILTIN(__builtin_ia32_cvtsbf162ss_32, "fy", "nc", "avx512bf16") TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_512, "vV8OiV8OiUc*Uc*", "nV:512:", "avx512vp2intersect") TARGET_BUILTIN(__builtin_ia32_vp2intersect_q_256, "vV4OiV4OiUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl") @@ -1977,6 +1977,9 @@ TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_selectpbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectpbf_256, "V16yUsV16yV16y", "ncV:256:", "avx512bf16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectpbf_512, "V32yUiV32yV32y", "ncV:512:", "avx512bf16") TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f") @@ -1987,6 +1990,7 @@ TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_selectsbf_128, "V8yUcV8yV8y", "ncV:128:", "avx512bf16") TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f") diff --git 
a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12667,18 +12667,6 @@ return Res; } -// Convert a BF16 to a float. -static Value *EmitX86CvtBF16ToFloatExpr(CodeGenFunction &CGF, - const CallExpr *E, - ArrayRef Ops) { - llvm::Type *Int32Ty = CGF.Builder.getInt32Ty(); - Value *ZeroExt = CGF.Builder.CreateZExt(Ops[0], Int32Ty); - Value *Shl = CGF.Builder.CreateShl(ZeroExt, 16); - llvm::Type *ResultType = CGF.ConvertType(E->getType()); - Value *BitCast = CGF.Builder.CreateBitCast(Shl, ResultType); - return BitCast; -} - Value *CodeGenFunction::EmitX86CpuIs(StringRef CPUStr) { llvm::Type *Int32Ty = Builder.getInt32Ty(); @@ -14085,6 +14073,9 @@ case X86::BI__builtin_ia32_selectph_128: case X86::BI__builtin_ia32_selectph_256: case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: case X86::BI__builtin_ia32_selectps_128: case X86::BI__builtin_ia32_selectps_256: case X86::BI__builtin_ia32_selectps_512: @@ -14093,6 +14084,7 @@ case X86::BI__builtin_ia32_selectpd_512: return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); case X86::BI__builtin_ia32_selectsh_128: + case X86::BI__builtin_ia32_selectsbf_128: case X86::BI__builtin_ia32_selectss_128: case X86::BI__builtin_ia32_selectsd_128: { Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); @@ -14929,7 +14921,7 @@ return EmitX86CvtF16ToFloatExpr(*this, Ops, ConvertType(E->getType())); } -// AVX512 bf16 intrinsics + // AVX512 bf16 intrinsics case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue( *this, Ops[2], @@ -14938,7 +14930,7 @@ return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } case X86::BI__builtin_ia32_cvtsbf162ss_32: - return EmitX86CvtBF16ToFloatExpr(*this, E, Ops); + return Builder.CreateFPExt(Ops[0], Builder.getFloatTy()); case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: { diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h --- a/clang/lib/Headers/avx512bf16intrin.h +++ b/clang/lib/Headers/avx512bf16intrin.h @@ -10,12 +10,16 @@ #error "Never use directly; include instead." #endif +#ifdef __SSE2__ + #ifndef __AVX512BF16INTRIN_H #define __AVX512BF16INTRIN_H -typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64))); -typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32))); -typedef unsigned short __bfloat16; +typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); +typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); +typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32))); +typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); +typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead"))); #define __DEFAULT_FN_ATTRS512 \ __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ @@ -33,7 +37,7 @@ /// A bfloat data. /// \returns A float data whose sign field and exponent field keep unchanged, /// and fraction field is extended to 23 bits. -static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) { +static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) { return __builtin_ia32_cvtsbf162ss_32(__A); } @@ -74,9 +78,9 @@ /// conversion of __B, and higher 256 bits come from conversion of __A. 
static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), - (__v32hi)__W); + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, + (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), + (__v32bf)__W); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -96,9 +100,9 @@ /// conversion of __B, and higher 256 bits come from conversion of __A. static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), - (__v32hi)_mm512_setzero_si512()); + return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U, + (__v32bf)_mm512_cvtne2ps_pbh(__A, __B), + (__v32bf)_mm512_setzero_si512()); } /// Convert Packed Single Data to Packed BF16 Data. @@ -113,7 +117,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS512 _mm512_cvtneps_pbh(__m512 __A) { return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)_mm256_undefined_si256(), + (__v16bf)_mm256_undefined_si256(), (__mmask16)-1); } @@ -134,7 +138,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS512 _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)__W, + (__v16bf)__W, (__mmask16)__U); } @@ -153,7 +157,7 @@ static __inline__ __m256bh __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)_mm256_setzero_si256(), + (__v16bf)_mm256_setzero_si256(), (__mmask16)__U); } @@ -277,3 +281,4 @@ #undef __DEFAULT_FN_ATTRS512 #endif +#endif diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h --- a/clang/lib/Headers/avx512vlbf16intrin.h +++ b/clang/lib/Headers/avx512vlbf16intrin.h @@ -10,10 +10,13 @@ #error "Never use directly; include instead." #endif +#ifdef __SSE2__ + #ifndef __AVX512VLBF16INTRIN_H #define __AVX512VLBF16INTRIN_H -typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); +typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); +typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -59,9 +62,9 @@ /// conversion of __B, and higher 64 bits come from conversion of __A. static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtne2ps_pbh(__A, __B), - (__v8hi)__W); + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_cvtne2ps_pbh(__A, __B), + (__v8bf)__W); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -81,9 +84,9 @@ /// conversion of __B, and higher 64 bits come from conversion of __A. static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtne2ps_pbh(__A, __B), - (__v8hi)_mm_setzero_si128()); + return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U, + (__v8bf)_mm_cvtne2ps_pbh(__A, __B), + (__v8bf)_mm_setzero_si128()); } /// Convert Two Packed Single Data to One Packed BF16 Data. 
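Context for the CGBuiltin.cpp change above, where the hand-rolled EmitX86CvtBF16ToFloatExpr (zero-extend to i32, shift left by 16, bitcast to float) is replaced by a single CreateFPExt: bf16 shares its sign and exponent layout with the upper half of an IEEE-754 single, so the widening is exact and a plain fpext yields the same bits. A minimal standalone C sketch of that equivalence (not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* Reinterpret a raw bf16 bit pattern as the float it denotes by placing it
       in the high 16 bits of a 32-bit single; sign, exponent and the 7 mantissa
       bits line up, and the low 16 mantissa bits are zero. An `fpext bfloat to
       float` produces exactly this value. */
    static float bf16_bits_to_float(uint16_t bits) {
      uint32_t wide = (uint32_t)bits << 16;
      float f;
      memcpy(&f, &wide, sizeof f); /* bit-exact reinterpretation, no rounding */
      return f;
    }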
@@ -123,9 +126,9 @@ /// conversion of __B, and higher 128 bits come from conversion of __A. static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), - (__v16hi)__W); + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, + (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), + (__v16bf)__W); } /// Convert Two Packed Single Data to One Packed BF16 Data. @@ -145,9 +148,9 @@ /// conversion of __B, and higher 128 bits come from conversion of __A. static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), - (__v16hi)_mm256_setzero_si256()); + return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U, + (__v16bf)_mm256_cvtne2ps_pbh(__A, __B), + (__v16bf)_mm256_setzero_si256()); } /// Convert Packed Single Data to Packed BF16 Data. @@ -163,7 +166,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_cvtneps_pbh(__m128 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)_mm_undefined_si128(), + (__v8bf)_mm_undefined_si128(), (__mmask8)-1); } @@ -185,7 +188,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)__W, + (__v8bf)__W, (__mmask8)__U); } @@ -205,7 +208,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)_mm_setzero_si128(), + (__v8bf)_mm_setzero_si128(), (__mmask8)__U); } @@ -221,7 +224,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_cvtneps_pbh(__m256 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)_mm_undefined_si128(), + (__v8bf)_mm_undefined_si128(), (__mmask8)-1); } @@ -242,7 +245,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)__W, + (__v8bf)__W, (__mmask8)__U); } @@ -261,7 +264,7 @@ static __inline__ __m128bh __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)_mm_setzero_si128(), + (__v8bf)_mm_setzero_si128(), (__mmask8)__U); } @@ -413,11 +416,11 @@ /// A float data. /// \returns A bf16 data whose sign field and exponent field keep unchanged, /// and fraction field is truncated to 7 bits. -static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { +static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { __v4sf __V = {__A, 0, 0, 0}; - __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( - (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); - return (__bfloat16)__R[0]; + __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask( + (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1); + return (__bf16)__R[0]; } /// Convert Packed BF16 Data to Packed float Data. 
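A short usage sketch (not from the patch) of the retyped header API, assuming the translation unit is built with -mavx512bf16 -mavx512vl so these headers are pulled in: the packed intrinsics now traffic in __bf16 elements instead of short, and the scalar helpers take and return __bf16 directly.

    #include <immintrin.h>

    /* Masked packed conversion: lanes whose mask bit is 0 keep the
       corresponding __bf16 element from `passthru`. */
    static __m256bh convert_with_mask(__m256bh passthru, __mmask16 mask,
                                      __m512 src) {
      return _mm512_mask_cvtneps_pbh(passthru, mask, src);
    }

    /* Scalar round trip: narrow to bf16 (dropping low mantissa bits), then
       widen back exactly. */
    static float roundtrip(float x) {
      __bf16 b = _mm_cvtness_sbh(x);
      return _mm_cvtsbh_ss(b);
    }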
@@ -520,3 +523,4 @@ #undef __DEFAULT_FN_ATTRS256 #endif +#endif diff --git a/clang/test/CodeGen/X86/avx512bf16-builtins.c b/clang/test/CodeGen/X86/avx512bf16-builtins.c --- a/clang/test/CodeGen/X86/avx512bf16-builtins.c +++ b/clang/test/CodeGen/X86/avx512bf16-builtins.c @@ -4,11 +4,8 @@ #include -float test_mm_cvtsbh_ss(__bfloat16 A) { - // CHECK-LABEL: @test_mm_cvtsbh_ss - // CHECK: zext i16 %{{.*}} to i32 - // CHECK: shl i32 %{{.*}}, 16 - // CHECK: bitcast i32 %{{.*}} to float +float test_mm_cvtsbh_ss(__bf16 A) { + // CHECK: fpext bfloat %{{.*}} to float // CHECK: ret float %{{.*}} return _mm_cvtsbh_ss(A); } @@ -16,46 +13,46 @@ __m512bh test_mm512_cvtne2ps_pbh(__m512 A, __m512 B) { // CHECK-LABEL: @test_mm512_cvtne2ps_pbh // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_cvtne2ps_pbh(A, B); } __m512bh test_mm512_maskz_cvtne2ps_pbh(__m512 A, __m512 B, __mmask32 U) { // CHECK-LABEL: @test_mm512_maskz_cvtne2ps_pbh // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_maskz_cvtne2ps_pbh(U, A, B); } __m512bh test_mm512_mask_cvtne2ps_pbh(__m512bh C, __mmask32 U, __m512 A, __m512 B) { // CHECK-LABEL: @test_mm512_mask_cvtne2ps_pbh // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_mask_cvtne2ps_pbh(C, U, A, B); } __m256bh test_mm512_cvtneps_pbh(__m512 A) { // CHECK-LABEL: @test_mm512_cvtneps_pbh // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512 - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm512_cvtneps_pbh(A); } __m256bh test_mm512_mask_cvtneps_pbh(__m256bh C, __mmask16 U, __m512 A) { // CHECK-LABEL: @test_mm512_mask_cvtneps_pbh // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512 - // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm512_mask_cvtneps_pbh(C, U, A); } __m256bh test_mm512_maskz_cvtneps_pbh(__m512 A, __mmask16 U) { // CHECK-LABEL: @test_mm512_maskz_cvtneps_pbh // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.512 - // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm512_maskz_cvtneps_pbh(U, A); } diff --git a/clang/test/CodeGen/X86/avx512bf16-error.c b/clang/test/CodeGen/X86/avx512bf16-error.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512bf16-error.c @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -ffreestanding -triple x86_64-linux-pc %s + +// expected-error@+1 3 {{unknown type name '__bfloat16'}} +__bfloat16 foo(__bfloat16 a, __bfloat16 b) { + return a + b; +} + +#include + +// expected-error@+4 {{invalid operands to binary expression ('__bfloat16' (aka '__bf16') and '__bfloat16')}} +// expected-warning@+2 3 {{'__bfloat16' is deprecated: use __bf16 instead}} +// expected-note@* 3 
{{'__bfloat16' has been explicitly marked deprecated here}} +__bfloat16 bar(__bfloat16 a, __bfloat16 b) { + return a + b; +} diff --git a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c --- a/clang/test/CodeGen/X86/avx512vlbf16-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbf16-builtins.c @@ -7,113 +7,113 @@ __m128bh test_mm_cvtne2ps2bf16(__m128 A, __m128 B) { // CHECK-LABEL: @test_mm_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128 - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_cvtne2ps_pbh(A, B); } __m128bh test_mm_maskz_cvtne2ps2bf16(__m128 A, __m128 B, __mmask8 U) { // CHECK-LABEL: @test_mm_maskz_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128 - // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_maskz_cvtne2ps_pbh(U, A, B); } __m128bh test_mm_mask_cvtne2ps2bf16(__m128bh C, __mmask8 U, __m128 A, __m128 B) { // CHECK-LABEL: @test_mm_mask_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.128 - // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_mask_cvtne2ps_pbh(C, U, A, B); } __m256bh test_mm256_cvtne2ps2bf16(__m256 A, __m256 B) { // CHECK-LABEL: @test_mm256_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256 - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm256_cvtne2ps_pbh(A, B); } __m256bh test_mm256_maskz_cvtne2ps2bf16(__m256 A, __m256 B, __mmask16 U) { // CHECK-LABEL: @test_mm256_maskz_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256 - // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm256_maskz_cvtne2ps_pbh(U, A, B); } __m256bh test_mm256_mask_cvtne2ps2bf16(__m256bh C, __mmask16 U, __m256 A, __m256 B) { // CHECK-LABEL: @test_mm256_mask_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.256 - // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} - // CHECK: ret <16 x i16> %{{.*}} + // CHECK: select <16 x i1> %{{.*}}, <16 x bfloat> %{{.*}}, <16 x bfloat> %{{.*}} + // CHECK: ret <16 x bfloat> %{{.*}} return _mm256_mask_cvtne2ps_pbh(C, U, A, B); } __m512bh test_mm512_cvtne2ps2bf16(__m512 A, __m512 B) { // CHECK-LABEL: @test_mm512_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_cvtne2ps_pbh(A, B); } __m512bh test_mm512_maskz_cvtne2ps2bf16(__m512 A, __m512 B, __mmask32 U) { // CHECK-LABEL: @test_mm512_maskz_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_maskz_cvtne2ps_pbh(U, A, B); } __m512bh test_mm512_mask_cvtne2ps2bf16(__m512bh C, __mmask32 U, __m512 A, __m512 B) { // CHECK-LABEL: @test_mm512_mask_cvtne2ps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtne2ps2bf16.512 - // CHECK: 
select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} - // CHECK: ret <32 x i16> %{{.*}} + // CHECK: select <32 x i1> %{{.*}}, <32 x bfloat> %{{.*}}, <32 x bfloat> %{{.*}} + // CHECK: ret <32 x bfloat> %{{.*}} return _mm512_mask_cvtne2ps_pbh(C, U, A, B); } __m128bh test_mm_cvtneps2bf16(__m128 A) { // CHECK-LABEL: @test_mm_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128 - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_cvtneps_pbh(A); } __m128bh test_mm_mask_cvtneps2bf16(__m128bh C, __mmask8 U, __m128 A) { // CHECK-LABEL: @test_mm_mask_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16. - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_mask_cvtneps_pbh(C, U, A); } __m128bh test_mm_maskz_cvtneps2bf16(__m128 A, __mmask8 U) { // CHECK-LABEL: @test_mm_maskz_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128 - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm_maskz_cvtneps_pbh(U, A); } __m128bh test_mm256_cvtneps2bf16(__m256 A) { // CHECK-LABEL: @test_mm256_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256 - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm256_cvtneps_pbh(A); } __m128bh test_mm256_mask_cvtneps2bf16(__m128bh C, __mmask8 U, __m256 A) { // CHECK-LABEL: @test_mm256_mask_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256 - // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm256_mask_cvtneps_pbh(C, U, A); } __m128bh test_mm256_maskz_cvtneps2bf16(__m256 A, __mmask8 U) { // CHECK-LABEL: @test_mm256_maskz_cvtneps2bf16 // CHECK: @llvm.x86.avx512bf16.cvtneps2bf16.256 - // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} - // CHECK: ret <8 x i16> %{{.*}} + // CHECK: select <8 x i1> %{{.*}}, <8 x bfloat> %{{.*}}, <8 x bfloat> %{{.*}} + // CHECK: ret <8 x bfloat> %{{.*}} return _mm256_maskz_cvtneps_pbh(U, A); } @@ -162,10 +162,10 @@ return _mm256_mask_dpbf16_ps(D, U, A, B); } -__bfloat16 test_mm_cvtness_sbh(float A) { +__bf16 test_mm_cvtness_sbh(float A) { // CHECK-LABEL: @test_mm_cvtness_sbh // CHECK: @llvm.x86.avx512bf16.mask.cvtneps2bf16.128 - // CHECK: ret i16 %{{.*}} + // CHECK: ret bfloat %{{.*}} return _mm_cvtness_sbh(A); } diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -327,6 +327,8 @@ def llvm_v2bf16_ty : LLVMType; // 2 x bfloat (__bf16) def llvm_v4bf16_ty : LLVMType; // 4 x bfloat (__bf16) def llvm_v8bf16_ty : LLVMType; // 8 x bfloat (__bf16) +def llvm_v16bf16_ty : LLVMType; // 16 x bfloat (__bf16) +def llvm_v32bf16_ty : LLVMType; // 32 x bfloat (__bf16) def llvm_v1f32_ty : LLVMType; // 1 x float def llvm_v2f32_ty : LLVMType; // 2 x float def llvm_v3f32_ty : LLVMType; // 3 x float diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -4901,27 +4901,27 @@ let TargetPrefix = "x86" in { def int_x86_avx512bf16_cvtne2ps2bf16_128: ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_128">, - Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], + Intrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def 
int_x86_avx512bf16_cvtne2ps2bf16_256: ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_256">, - Intrinsic<[llvm_v16i16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], + Intrinsic<[llvm_v16bf16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtne2ps2bf16_512: ClangBuiltin<"__builtin_ia32_cvtne2ps2bf16_512">, - Intrinsic<[llvm_v32i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty], + Intrinsic<[llvm_v32bf16_ty], [llvm_v16f32_ty, llvm_v16f32_ty], [IntrNoMem]>; // Intrinsic must be masked due to it producing less than 128 bits of results. def int_x86_avx512bf16_mask_cvtneps2bf16_128: - Intrinsic<[llvm_v8i16_ty], - [llvm_v4f32_ty, llvm_v8i16_ty, llvm_v4i1_ty], + Intrinsic<[llvm_v8bf16_ty], + [llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v4i1_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_256: ClangBuiltin<"__builtin_ia32_cvtneps2bf16_256">, - Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v8bf16_ty], [llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_512: ClangBuiltin<"__builtin_ia32_cvtneps2bf16_512">, - Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v16bf16_ty], [llvm_v16f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_128: ClangBuiltin<"__builtin_ia32_dpbf16ps_128">, Intrinsic<[llvm_v4f32_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -82,6 +82,16 @@ return true; } +static bool UpgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, + Function *&NewFn) { + if (F->getReturnType()->getScalarType()->isBFloatTy()) + return false; + + rename(F); + NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + return true; +} + static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { // All of the intrinsics matches below should be marked with which llvm // version started autoupgrading them. At some point in the future we would @@ -488,6 +498,24 @@ if (Name == "avx512.mask.cmp.ps.512") // Added in 7.0 return UpgradeX86MaskedFPCompare(F, Intrinsic::x86_avx512_mask_cmp_ps_512, NewFn); + if (Name == "avx512bf16.cvtne2ps2bf16.128") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_cvtne2ps2bf16_128, NewFn); + if (Name == "avx512bf16.cvtne2ps2bf16.256") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_cvtne2ps2bf16_256, NewFn); + if (Name == "avx512bf16.cvtne2ps2bf16.512") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_cvtne2ps2bf16_512, NewFn); + if (Name == "avx512bf16.mask.cvtneps2bf16.128") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128, NewFn); + if (Name == "avx512bf16.cvtneps2bf16.256") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_cvtneps2bf16_256, NewFn); + if (Name == "avx512bf16.cvtneps2bf16.512") // Added in 9.0 + return UpgradeX86BF16Intrinsic( + F, Intrinsic::x86_avx512bf16_cvtneps2bf16_512, NewFn); // frcz.ss/sd may need to have an argument dropped. 
Added in 3.2 if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) { @@ -4054,6 +4082,29 @@ return; } + case Intrinsic::x86_avx512bf16_cvtne2ps2bf16_128: + case Intrinsic::x86_avx512bf16_cvtne2ps2bf16_256: + case Intrinsic::x86_avx512bf16_cvtne2ps2bf16_512: + case Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128: + case Intrinsic::x86_avx512bf16_cvtneps2bf16_256: + case Intrinsic::x86_avx512bf16_cvtneps2bf16_512: { + SmallVector Args(CI->args()); + unsigned NumElts = cast(CI->getType())->getNumElements(); + if (NewFn->getIntrinsicID() == + Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128) + Args[1] = Builder.CreateBitCast( + Args[1], FixedVectorType::get(Builder.getBFloatTy(), NumElts)); + + NewCall = Builder.CreateCall(NewFn, Args); + Value *Res = Builder.CreateBitCast( + NewCall, FixedVectorType::get(Builder.getInt16Ty(), NumElts)); + + NewCall->takeName(CI); + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + return; + } + case Intrinsic::thread_pointer: { NewCall = Builder.CreateCall(NewFn, {}); break; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2178,6 +2178,17 @@ } } + if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) { + addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); + addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); + addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); + setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v8bf16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v16bf16, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); + addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); @@ -9921,6 +9932,18 @@ return NV; } +// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types. +static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Op.getSimpleValueType(); + MVT IVT = VT.changeVectorElementTypeToInteger(); + SmallVector NewOps; + for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) + NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I))); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps); + return DAG.getBitcast(VT, Res); +} + // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -11075,6 +11098,9 @@ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); + if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16()) + return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); + if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) return VectorConstant; diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -53,34 +53,36 @@ string EltTypeName = !cast(EltVT); // Size of the element type in bits, e.g. 32 for v16i32. 
- string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName)); + string EltSizeName = !subst("i", "", !subst("f", "", !subst("b", "", EltTypeName))); int EltSize = EltVT.Size; // "i" for integer types and "f" for floating-point types - string TypeVariantName = !subst(EltSizeName, "", EltTypeName); + string TypeVariantName = !subst("b", "", !subst(EltSizeName, "", EltTypeName)); // Size of RC in bits, e.g. 512 for VR512. int Size = VT.Size; // The corresponding memory operand, e.g. i512mem for VR512. X86MemOperand MemOp = !cast(TypeVariantName # Size # "mem"); - X86MemOperand ScalarMemOp = !cast(EltVT # "mem"); + X86MemOperand ScalarMemOp = !cast(!subst("b", "", EltTypeName) # "mem"); // FP scalar memory operand for intrinsics - ssmem/sdmem. Operand IntScalarMemOp = !if (!eq (EltTypeName, "f16"), !cast("shmem"), + !if (!eq (EltTypeName, "bf16"), !cast("shmem"), !if (!eq (EltTypeName, "f32"), !cast("ssmem"), - !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?))); + !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?)))); // Load patterns PatFrag LdFrag = !cast("load" # VTName); PatFrag AlignedLdFrag = !cast("alignedload" # VTName); - PatFrag ScalarLdFrag = !cast("load" # EltVT); + PatFrag ScalarLdFrag = !cast("load" # !subst("b", "", EltTypeName)); PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f16"), !cast("sse_load_f16"), + !if (!eq (EltTypeName, "bf16"), !cast("sse_load_f16"), !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), - !if (!eq (EltTypeName, "f64"), !cast("sse_load_f64"), ?))); + !if (!eq (EltTypeName, "f64"), !cast("sse_load_f64"), ?)))); // The string to specify embedded broadcast in assembly. string BroadcastStr = "{1to" # NumElts # "}"; @@ -96,11 +98,13 @@ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, !if (!eq (EltTypeName, "f64"), SSEPackedDouble, !if (!eq (EltTypeName, "f16"), SSEPackedSingle, // FIXME? - SSEPackedInt))); + !if (!eq (EltTypeName, "bf16"), SSEPackedSingle, // FIXME? 
+ SSEPackedInt)))); RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, !if (!eq (EltTypeName, "f16"), FR16X, - FR64X)); + !if (!eq (EltTypeName, "bf16"), FR16X, + FR64X))); dag ImmAllZerosV = (VT immAllZerosV); @@ -113,6 +117,7 @@ def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; +def v32bf16_info: X86VectorVTInfo<32, bf16, VR512, "pbf">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -122,6 +127,7 @@ def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; +def v16bf16x_info: X86VectorVTInfo<16, bf16, VR256X, "pbf">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -130,6 +136,7 @@ def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">; +def v8bf16x_info : X86VectorVTInfo<8, bf16, VR128X, "pbf">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; @@ -138,6 +145,7 @@ def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; def f16x_info : X86VectorVTInfo<1, f16, VR128X, "sh">; +def bf16x_info : X86VectorVTInfo<1, bf16, VR128X, "sbf">; def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; @@ -158,6 +166,8 @@ v2i64x_info>; def avx512vl_f16_info : AVX512VLVectorVTInfo; +def avx512vl_bf16_info : AVX512VLVectorVTInfo; def avx512vl_f32_info : AVX512VLVectorVTInfo; def avx512vl_f64_info : AVX512VLVectorVTInfo; defm : mask_move_lowering<"VMOVDQU16Z", v16f16x_info, v32f16_info>; + + defm : mask_move_lowering<"VMOVDQU16Z", v8bf16x_info, v32bf16_info>; + defm : mask_move_lowering<"VMOVDQU16Z", v16bf16x_info, v32bf16_info>; } let Predicates = [HasAVX512] in { @@ -3771,6 +3784,8 @@ (VMOVDQA64Zrm addr:$src)>; def : Pat<(alignedloadv32f16 addr:$src), (VMOVAPSZrm addr:$src)>; + def : Pat<(alignedloadv32bf16 addr:$src), + (VMOVAPSZrm addr:$src)>; def : Pat<(alignedloadv64i8 addr:$src), (VMOVDQA64Zrm addr:$src)>; def : Pat<(loadv16i32 addr:$src), @@ -3779,6 +3794,8 @@ (VMOVDQU64Zrm addr:$src)>; def : Pat<(loadv32f16 addr:$src), (VMOVUPSZrm addr:$src)>; + def : Pat<(loadv32bf16 addr:$src), + (VMOVUPSZrm addr:$src)>; def : Pat<(loadv64i8 addr:$src), (VMOVDQU64Zrm addr:$src)>; @@ -3789,6 +3806,8 @@ (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), (VMOVAPSZmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v32bf16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), (VMOVDQA64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v16i32 VR512:$src), addr:$dst), @@ -3797,6 +3816,8 @@ (VMOVDQU64Zmr addr:$dst, VR512:$src)>; def : Pat<(store (v32f16 VR512:$src), addr:$dst), (VMOVUPSZmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32bf16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; def : Pat<(store (v64i8 VR512:$src), addr:$dst), (VMOVDQU64Zmr addr:$dst, VR512:$src)>; } @@ -3809,6 +3830,8 @@ (VMOVDQA64Z128rm addr:$src)>; def : Pat<(alignedloadv8f16 addr:$src), (VMOVAPSZ128rm addr:$src)>; + def : Pat<(alignedloadv8bf16 addr:$src), + (VMOVAPSZ128rm 
addr:$src)>; def : Pat<(alignedloadv16i8 addr:$src), (VMOVDQA64Z128rm addr:$src)>; def : Pat<(loadv4i32 addr:$src), @@ -3817,6 +3840,8 @@ (VMOVDQU64Z128rm addr:$src)>; def : Pat<(loadv8f16 addr:$src), (VMOVUPSZ128rm addr:$src)>; + def : Pat<(loadv8bf16 addr:$src), + (VMOVUPSZ128rm addr:$src)>; def : Pat<(loadv16i8 addr:$src), (VMOVDQU64Z128rm addr:$src)>; @@ -3827,6 +3852,8 @@ (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(alignedstore (v8bf16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), (VMOVDQA64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v4i32 VR128X:$src), addr:$dst), @@ -3835,6 +3862,8 @@ (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v8f16 VR128X:$src), addr:$dst), (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8bf16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; def : Pat<(store (v16i8 VR128X:$src), addr:$dst), (VMOVDQU64Z128mr addr:$dst, VR128X:$src)>; @@ -3845,6 +3874,8 @@ (VMOVDQA64Z256rm addr:$src)>; def : Pat<(alignedloadv16f16 addr:$src), (VMOVAPSZ256rm addr:$src)>; + def : Pat<(alignedloadv16bf16 addr:$src), + (VMOVAPSZ256rm addr:$src)>; def : Pat<(alignedloadv32i8 addr:$src), (VMOVDQA64Z256rm addr:$src)>; def : Pat<(loadv8i32 addr:$src), @@ -3853,6 +3884,8 @@ (VMOVDQU64Z256rm addr:$src)>; def : Pat<(loadv16f16 addr:$src), (VMOVUPSZ256rm addr:$src)>; + def : Pat<(loadv16bf16 addr:$src), + (VMOVUPSZ256rm addr:$src)>; def : Pat<(loadv32i8 addr:$src), (VMOVDQU64Z256rm addr:$src)>; @@ -3863,6 +3896,8 @@ (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(alignedstore (v16bf16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), (VMOVDQA64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v8i32 VR256X:$src), addr:$dst), @@ -3871,89 +3906,97 @@ (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v16f16 VR256X:$src), addr:$dst), (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16bf16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } + +multiclass mask_move_lowering_f16_bf16 { let Predicates = [HasBWI] in { - def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, (_.info512.VT VR512:$src1), (_.info512.VT VR512:$src0))), (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; - def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, (_.info512.VT VR512:$src1), _.info512.ImmAllZerosV)), (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; - def : Pat<(v32f16 (vselect VK32WM:$mask, - (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, + (_.info512.VT (_.info512.AlignedLdFrag addr:$src)), (_.info512.VT VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (vselect VK32WM:$mask, - (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, + (_.info512.VT (_.info512.AlignedLdFrag addr:$src)), _.info512.ImmAllZerosV)), (VMOVDQU16Zrmkz 
VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (vselect VK32WM:$mask, - (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, + (_.info512.VT (_.info512.LdFrag addr:$src)), (_.info512.VT VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (vselect VK32WM:$mask, - (v32f16 (loadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + def : Pat<(_.info512.VT (vselect VK32WM:$mask, + (_.info512.VT (_.info512.LdFrag addr:$src)), _.info512.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, (v32f16 VR512:$src0))), + def : Pat<(_.info512.VT (masked_load addr:$src, VK32WM:$mask, (_.info512.VT VR512:$src0))), (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, undef)), + def : Pat<(_.info512.VT (masked_load addr:$src, VK32WM:$mask, undef)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), + def : Pat<(_.info512.VT (masked_load addr:$src, VK32WM:$mask, _.info512.ImmAllZerosV)), (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; - def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), + def : Pat<(masked_store (_.info512.VT VR512:$src), addr:$dst, VK32WM:$mask), (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; } let Predicates = [HasBWI, HasVLX] in { - def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src0))), (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; - def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, (_.info256.VT VR256X:$src1), _.info256.ImmAllZerosV)), (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; - def : Pat<(v16f16 (vselect VK16WM:$mask, - (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, + (_.info256.VT (_.info256.AlignedLdFrag addr:$src)), (_.info256.VT VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (vselect VK16WM:$mask, - (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, + (_.info256.VT (_.info256.AlignedLdFrag addr:$src)), _.info256.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (vselect VK16WM:$mask, - (v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, + (_.info256.VT (_.info256.LdFrag addr:$src)), (_.info256.VT VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (vselect VK16WM:$mask, - (v16f16 (loadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + def : Pat<(_.info256.VT (vselect VK16WM:$mask, + (_.info256.VT (_.info256.LdFrag addr:$src)), _.info256.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, (v16f16 VR256X:$src0))), + def : Pat<(_.info256.VT (masked_load addr:$src, VK16WM:$mask, (_.info256.VT VR256X:$src0))), (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; - def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, undef)), + def : Pat<(_.info256.VT (masked_load addr:$src, VK16WM:$mask, undef)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - 
def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), + def : Pat<(_.info256.VT (masked_load addr:$src, VK16WM:$mask, _.info256.ImmAllZerosV)), (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; - def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), + def : Pat<(masked_store (_.info256.VT VR256X:$src), addr:$dst, VK16WM:$mask), (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), (v8f16 VR128X:$src0))), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, (_.info128.VT VR128X:$src1), (_.info128.VT VR128X:$src0))), (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, (_.info128.VT VR128X:$src1), _.info128.ImmAllZerosV)), (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, - (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, + (_.info128.VT (_.info128.AlignedLdFrag addr:$src)), (_.info128.VT VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, - (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, + (_.info128.VT (_.info128.AlignedLdFrag addr:$src)), _.info128.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, - (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, + (_.info128.VT (_.info128.LdFrag addr:$src)), (_.info128.VT VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (vselect VK8WM:$mask, - (v8f16 (loadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), + def : Pat<(_.info128.VT (vselect VK8WM:$mask, + (_.info128.VT (_.info128.LdFrag addr:$src)), _.info128.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, (v8f16 VR128X:$src0))), + def : Pat<(_.info128.VT (masked_load addr:$src, VK8WM:$mask, (_.info128.VT VR128X:$src0))), (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, undef)), + def : Pat<(_.info128.VT (masked_load addr:$src, VK8WM:$mask, undef)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), + def : Pat<(_.info128.VT (masked_load addr:$src, VK8WM:$mask, _.info128.ImmAllZerosV)), (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; - def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), + def : Pat<(masked_store (_.info128.VT VR128X:$src), addr:$dst, VK8WM:$mask), (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; } +} + +defm : mask_move_lowering_f16_bf16; +defm : mask_move_lowering_f16_bf16; // Move Int Doubleword to Packed Double Int // @@ -12811,7 +12854,7 @@ let ExeDomain = SSEPackedSingle in defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF - avx512vl_f32_info, avx512vl_i16_info, + avx512vl_f32_info, avx512vl_bf16_info, X86cvtne2ps2bf16, HasBF16, 0>, T8XD; // Truncate Float to BFloat16 @@ -12819,15 +12862,15 @@ X86SchedWriteWidths sched> { let ExeDomain = SSEPackedSingle in { let Predicates = [HasBF16], Uses = [], mayRaiseFPException = 0 in { - defm Z : avx512_vcvt_fp, EVEX_V512; 
} let Predicates = [HasBF16, HasVLX] in { let Uses = [], mayRaiseFPException = 0 in { - defm Z128 : avx512_vcvt_fp, EVEX_V128; - defm Z256 : avx512_vcvt_fp, EVEX_V256; } @@ -12855,32 +12898,32 @@ let Predicates = [HasBF16, HasVLX] in { // Special patterns to allow use of X86mcvtneps2bf16 for masking. Instruction // patterns have been disabled with null_frag. - def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), + def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 VR128X:$src))), (VCVTNEPS2BF16Z128rr VR128X:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8i16 VR128X:$src0), + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), (v8bf16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rrk VR128X:$src0, VK4WM:$mask, VR128X:$src)>; - def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8i16x_info.ImmAllZerosV, + def : Pat<(X86mcvtneps2bf16 (v4f32 VR128X:$src), v8bf16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rrkz VK4WM:$mask, VR128X:$src)>; - def : Pat<(v8i16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), + def : Pat<(v8bf16 (X86cvtneps2bf16 (loadv4f32 addr:$src))), (VCVTNEPS2BF16Z128rm addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8i16 VR128X:$src0), + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), (v8bf16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmk VR128X:$src0, VK4WM:$mask, addr:$src)>; - def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8i16x_info.ImmAllZerosV, + def : Pat<(X86mcvtneps2bf16 (loadv4f32 addr:$src), v8bf16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmkz VK4WM:$mask, addr:$src)>; - def : Pat<(v8i16 (X86cvtneps2bf16 (v4f32 + def : Pat<(v8bf16 (X86cvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)))), (VCVTNEPS2BF16Z128rmb addr:$src)>; def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), - (v8i16 VR128X:$src0), VK4WM:$mask), + (v8bf16 VR128X:$src0), VK4WM:$mask), (VCVTNEPS2BF16Z128rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>; def : Pat<(X86mcvtneps2bf16 (v4f32 (X86VBroadcastld32 addr:$src)), - v8i16x_info.ImmAllZerosV, VK4WM:$mask), + v8bf16x_info.ImmAllZerosV, VK4WM:$mask), (VCVTNEPS2BF16Z128rmbkz VK4WM:$mask, addr:$src)>; } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -785,18 +785,18 @@ // cvt fp to bfloat16 def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16", - SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>, + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, bf16>, SDTCVecEltisVT<1, f32>, SDTCisSameSizeAs<0,1>, SDTCisSameAs<1,2>]>>; def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16", - SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>, + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, bf16>, SDTCVecEltisVT<1, f32>, SDTCisSameAs<0, 2>, SDTCVecEltisVT<3, i1>, SDTCisSameNumEltsAs<1, 3>]>>; def X86cvtneps2bf16 : SDNode<"X86ISD::CVTNEPS2BF16", - SDTypeProfile<1, 1, [SDTCVecEltisVT<0, i16>, + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, bf16>, SDTCVecEltisVT<1, f32>]>>; def X86dpbf16ps : SDNode<"X86ISD::DPBF16PS", SDTypeProfile<1, 3, [SDTCVecEltisVT<0, f32>, @@ -819,6 +819,7 @@ // 128-bit load pattern fragments def loadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (load node:$ptr))>; +def loadv8bf16 : PatFrag<(ops node:$ptr), (v8bf16 (load node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; @@ -828,6 +829,7 @@ // 256-bit load 
pattern fragments def loadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (load node:$ptr))>; +def loadv16bf16 : PatFrag<(ops node:$ptr), (v16bf16 (load node:$ptr))>; def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; @@ -837,6 +839,7 @@ // 512-bit load pattern fragments def loadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (load node:$ptr))>; +def loadv32bf16 : PatFrag<(ops node:$ptr), (v32bf16 (load node:$ptr))>; def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -870,6 +873,8 @@ // NOTE: all 128-bit integer vector loads are promoted to v2i64 def alignedloadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (alignedload node:$ptr))>; +def alignedloadv8bf16 : PatFrag<(ops node:$ptr), + (v8bf16 (alignedload node:$ptr))>; def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), @@ -887,6 +892,8 @@ // NOTE: all 256-bit integer vector loads are promoted to v4i64 def alignedloadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (alignedload node:$ptr))>; +def alignedloadv16bf16 : PatFrag<(ops node:$ptr), + (v16bf16 (alignedload node:$ptr))>; def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), @@ -903,6 +910,8 @@ // 512-bit aligned load pattern fragments def alignedloadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (alignedload node:$ptr))>; +def alignedloadv32bf16 : PatFrag<(ops node:$ptr), + (v32bf16 (alignedload node:$ptr))>; def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (alignedload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -569,9 +569,9 @@ // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -589,7 +589,7 @@ } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v64i8, v32i16, v16i32, v8i64], +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v32bf16, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Represents the lower 16 registers that have VEX/legacy encodable subregs. 
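The backend changes above (new v8bf16/v16bf16/v32bf16 entries in the VR128X/VR256X/VR512 register classes, plus reuse of the existing VMOVAPS/VMOVUPS/VMOVDQU16 load, store and masked-move patterns) rest on bf16 vectors being ordinary 128/256/512-bit payloads. A small C sketch of that assumption (hypothetical helper name, not part of the patch); it presumes a compiler and headers where __m128bh/__m256bh/__m512bh are defined, e.g. when building with -mavx512bf16 -mavx512vl:

    #include <immintrin.h>
    #include <string.h>

    _Static_assert(sizeof(__m128bh) == 16, "v8bf16 fills one XMM register");
    _Static_assert(sizeof(__m256bh) == 32, "v16bf16 fills one YMM register");
    _Static_assert(sizeof(__m512bh) == 64, "v32bf16 fills one ZMM register");

    /* View a bf16 vector as raw 16-bit lanes; the payload is unchanged, which
       is why the same aligned/unaligned move instructions can be reused. */
    static __m512i bf16_vector_bits(__m512bh v) {
      __m512i bits;
      memcpy(&bits, &v, sizeof bits);
      return bits;
    }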
@@ -604,9 +604,9 @@ def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;} // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers diff --git a/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll copy from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll copy to llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-intrinsics-upgrade.ll @@ -18,14 +18,16 @@ define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 { ; X86-LABEL: test_mm512_maskz_cvtne2ps2bf16_512: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x72,0xc1] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x72,0xc1] +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm512_maskz_cvtne2ps2bf16_512: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x72,0xc1] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x72,0xc1] +; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 @@ -38,14 +40,16 @@ define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 { ; X86-LABEL: test_mm512_mask_cvtne2ps2bf16_512: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %zmm2, %zmm1, %zmm1 # encoding: [0x62,0xf2,0x77,0x48,0x72,0xca] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ps2bf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x72,0xc2] +; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm512_mask_cvtne2ps2bf16_512: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %zmm2, %zmm1, %zmm1 # encoding: [0x62,0xf2,0x77,0x48,0x72,0xca] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x72,0xc2] +; X64-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 @@ -72,14 +76,18 @@ define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 { ; X86-LABEL: test_mm512_maskz_cvtneps2bf16_512: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0 # 
encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneps2bf16 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x72,0xc0] +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xc0] +; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm512_maskz_cvtneps2bf16_512: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtneps2bf16 %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneps2bf16 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x72,0xc0] +; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xc0] +; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 @@ -92,14 +100,20 @@ define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 { ; X86-LABEL: test_mm512_mask_cvtneps2bf16_512: ; X86: # %bb.0: # %entry +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: vcvtneps2bf16 %zmm1, %ymm1 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtneps2bf16 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x72,0xc1] +; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] +; X86-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm512_mask_cvtneps2bf16_512: ; X64: # %bb.0: # %entry +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: vcvtneps2bf16 %zmm1, %ymm1 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneps2bf16 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x72,0xc1] +; X64-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] +; X64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 diff --git a/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 -declare <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 { ; CHECK-LABEL: test_mm512_cvtne2ps2bf16_512: @@ -10,8 +10,8 @@ ; CHECK-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7f,0x48,0x72,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 - %1 = bitcast <32 x i16> %0 to <8 x i64> + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> 
%B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> ret <8 x i64> %1 } @@ -28,10 +28,10 @@ ; X64-NEXT: vcvtne2ps2bf16 %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xc9,0x72,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 %1 = bitcast i32 %U to <32 x i1> - %2 = select <32 x i1> %1, <32 x i16> %0, <32 x i16> zeroinitializer - %3 = bitcast <32 x i16> %2 to <8 x i64> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> ret <8 x i64> %3 } @@ -48,15 +48,15 @@ ; X64-NEXT: vcvtne2ps2bf16 %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x49,0x72,0xc2] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 - %1 = bitcast <8 x i64> %C to <32 x i16> + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> %2 = bitcast i32 %U to <32 x i1> - %3 = select <32 x i1> %2, <32 x i16> %0, <32 x i16> %1 - %4 = bitcast <32 x i16> %3 to <8 x i64> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> ret <8 x i64> %4 } -declare <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 { ; CHECK-LABEL: test_mm512_cvtneps2bf16_512: @@ -64,8 +64,8 @@ ; CHECK-NEXT: vcvtneps2bf16 %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x72,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 - %1 = bitcast <16 x i16> %0 to <4 x i64> + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> ret <4 x i64> %1 } @@ -82,10 +82,10 @@ ; X64-NEXT: vcvtneps2bf16 %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x72,0xc0] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 %1 = bitcast i16 %U to <16 x i1> - %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer - %3 = bitcast <16 x i16> %2 to <4 x i64> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> ret <4 x i64> %3 } @@ -102,11 +102,11 @@ ; X64-NEXT: vcvtneps2bf16 %zmm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x72,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 - %1 = bitcast <4 x i64> %C to <16 x i16> + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> %2 = bitcast i16 %U to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %0, <16 x i16> %1 - %4 = bitcast <16 x i16> %3 to <4 x i64> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> ret <4 x i64> %4 } diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll 
b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll copy from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll copy to llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics-upgrade.ll @@ -18,15 +18,17 @@ define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 { ; X86-LABEL: test_mm_maskz_cvtne2ps2bf16_128: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1] +; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm_maskz_cvtne2ps2bf16_128: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1] +; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 @@ -39,15 +41,17 @@ define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 { ; X86-LABEL: test_mm_mask_cvtne2ps2bf16_128: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm1 # encoding: [0x62,0xf2,0x77,0x08,0x72,0xca] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2] +; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm_mask_cvtne2ps2bf16_128: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm1 # encoding: [0x62,0xf2,0x77,0x08,0x72,0xca] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2] +; X64-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 @@ -74,14 +78,16 @@ define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 { ; X86-LABEL: test_mm256_maskz_cvtne2ps2bf16_256: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0x7f,0x28,0x72,0xc1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1] +; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm256_maskz_cvtne2ps2bf16_256: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: 
[0x62,0xf2,0x7f,0x28,0x72,0xc1] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1] +; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 @@ -94,14 +100,16 @@ define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 { ; X86-LABEL: test_mm256_mask_cvtne2ps2bf16_256: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm1 # encoding: [0x62,0xf2,0x77,0x28,0x72,0xca] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2] +; X86-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm256_mask_cvtne2ps2bf16_256: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm1 # encoding: [0x62,0xf2,0x77,0x28,0x72,0xca] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2] +; X64-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 @@ -129,16 +137,18 @@ define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 { ; X86-LABEL: test_mm256_maskz_cvtneps2bf16_256: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0] +; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mm256_maskz_cvtneps2bf16_256: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtneps2bf16 %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneps2bf16 %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x72,0xc0] +; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: @@ -152,16 +162,18 @@ define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 { ; X86-LABEL: test_mm256_mask_cvtneps2bf16_256: ; X86: # %bb.0: # %entry +; X86-NEXT: vcvtneps2bf16 %ymm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1] +; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_mm256_mask_cvtneps2bf16_256: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvtneps2bf16 %ymm1, %xmm1 # encoding: [0x62,0xf2,0x7e,0x28,0x72,0xc9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvtneps2bf16 %ymm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x72,0xc1] +; X64-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bf16 -mattr=+avx512vl --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64 -declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 { ; CHECK-LABEL: test_mm_cvtne2ps2bf16_128: @@ -10,8 +10,8 @@ ; CHECK-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7f,0x08,0x72,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 - %1 = bitcast <8 x i16> %0 to <2 x i64> + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> ret <2 x i64> %1 } @@ -29,10 +29,10 @@ ; X64-NEXT: vcvtne2ps2bf16 %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0x89,0x72,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 %1 = bitcast i8 %U to <8 x i1> - %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer - %3 = bitcast <8 x i16> %2 to <2 x i64> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> ret <2 x i64> %3 } @@ -50,15 +50,15 @@ ; X64-NEXT: vcvtne2ps2bf16 %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x77,0x09,0x72,0xc2] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 - %1 = bitcast <2 x i64> %C to <8 x i16> + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> %2 = bitcast i8 %U to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1 - %4 = bitcast <8 x i16> %3 to <2 x i64> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> ret <2 x i64> %4 } -declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 { ; CHECK-LABEL: test_mm256_cvtne2ps2bf16_256: @@ -66,8 +66,8 @@ ; CHECK-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 # encoding: 
[0x62,0xf2,0x7f,0x28,0x72,0xc1] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 - %1 = bitcast <16 x i16> %0 to <4 x i64> + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> ret <4 x i64> %1 } @@ -84,10 +84,10 @@ ; X64-NEXT: vcvtne2ps2bf16 %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7f,0xa9,0x72,0xc1] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 %1 = bitcast i16 %U to <16 x i1> - %2 = select <16 x i1> %1, <16 x i16> %0, <16 x i16> zeroinitializer - %3 = bitcast <16 x i16> %2 to <4 x i64> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> ret <4 x i64> %3 } @@ -104,15 +104,15 @@ ; X64-NEXT: vcvtne2ps2bf16 %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x77,0x29,0x72,0xc2] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 - %1 = bitcast <4 x i64> %C to <16 x i16> + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> %2 = bitcast i16 %U to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %0, <16 x i16> %1 - %4 = bitcast <16 x i16> %3 to <4 x i64> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> ret <4 x i64> %4 } -declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 { ; CHECK-LABEL: test_mm256_cvtneps2bf16_256: @@ -121,8 +121,8 @@ ; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 - %1 = bitcast <8 x i16> %0 to <2 x i64> + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> ret <2 x i64> %1 } @@ -142,10 +142,10 @@ ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 %1 = bitcast i8 %U to <8 x i1> - %2 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> zeroinitializer - %3 = bitcast <8 x i16> %2 to <2 x i64> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> ret <2 x i64> %3 } @@ -165,15 +165,15 @@ ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 - %1 = bitcast <2 x i64> %C to <8 x i16> + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> %2 = bitcast i8 %U to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %0, <8 x i16> %1 - %4 = bitcast <8 x i16> %3 to <2 x i64> + %3 = select <8 x i1> %2, <8 x bfloat> 
%0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> ret <2 x i64> %4 } -declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>) #3 +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 { ; CHECK-LABEL: test_mm128_cvtneps2bf16_128: @@ -181,8 +181,8 @@ ; CHECK-NEXT: vcvtneps2bf16 %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x72,0xc0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: - %0 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> ) #4 - %1 = bitcast <8 x i16> %0 to <2 x i64> + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> ) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> ret <2 x i64> %1 } @@ -202,8 +202,8 @@ entry: %0 = bitcast i8 %U to <8 x i1> %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> zeroinitializer, <4 x i1> %1) #4 - %3 = bitcast <8 x i16> %2 to <2 x i64> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> ret <2 x i64> %3 } @@ -223,9 +223,9 @@ entry: %0 = bitcast i8 %U to <8 x i1> %1 = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> - %2 = bitcast <2 x i64> %C to <8 x i16> - %3 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> %2, <4 x i1> %1) #4 - %4 = bitcast <8 x i16> %3 to <2 x i64> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> ret <2 x i64> %4 } @@ -248,10 +248,10 @@ ; X64-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast i8 %U to <8 x i1> - %1 = bitcast <2 x i64> %C to <8 x i16> - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x i16> undef, <4 x i1> ) #4 - %3 = select <8 x i1> %0, <8 x i16> %2, <8 x i16> %1 - %4 = bitcast <8 x i16> %3 to <2 x i64> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> undef, <4 x i1> ) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> ret <2 x i64> %4 } diff --git a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll --- a/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-avx512bf16.ll @@ -9,7 +9,7 @@ ; By including a nop call with sideeffects we can force a partial register spill of the ; relevant registers and check that the reload is correctly folded into the instruction. 
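[Note on the masking pattern in the updated tests] A minimal sketch, assuming the <N x bfloat> intrinsic signatures introduced above; the function and value names are illustrative only, not part of the patch. Masking is expressed as a generic vector select on bfloat lanes rather than a dedicated masked intrinsic, which is why the bfloat-typed tests keep matching the masked vcvtne2ps2bf16/vcvtneps2bf16 forms while the "-upgrade" copies, which keep the old <N x i16> IR, now match an unmasked convert followed by a vmovdqu16 blend.

; Sketch: zero-masked 128-bit convert with the updated <8 x bfloat> signature.
declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>)

define <2 x i64> @sketch_maskz_cvtne2ps2bf16_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
entry:
  ; Convert the two <4 x float> sources into one <8 x bfloat> result.
  %cvt = call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a, <4 x float> %b)
  ; Zero-masking is an ordinary select on bfloat elements.
  %m = bitcast i8 %mask to <8 x i1>
  %sel = select <8 x i1> %m, <8 x bfloat> %cvt, <8 x bfloat> zeroinitializer
  ; Bitcast to <2 x i64> to mirror the return type used by the tests above.
  %res = bitcast <8 x bfloat> %sel to <2 x i64>
  ret <2 x i64> %res
}

Under -mattr=+avx512bf16,+avx512vl this select is expected to fold into the masked instruction form, matching the unchanged CHECK lines in avx512bf16-vl-intrinsics.ll.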
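[Note on the stack-folding tests that follow] A second sketch, tying the stack_fold_* changes below to the X86RegisterInfo.td hunk at the top of this diff; the function name is illustrative, while the intrinsic and the asm clobber string are taken from the tests themselves. Because v8bf16/v16bf16 are now member types of VR128X/VR256X, the <8 x bfloat>/<16 x bfloat> results of these functions are legal in XMM/YMM registers, so one source can be spilled around the asm and its reload folded into the memory form of vcvtne2ps2bf16.

; Sketch of the stack-folding pattern: the asm clobbers xmm2-xmm31, so with
; %a0 and %a1 both live across it, one of them has to be spilled; the real
; tests then use FileCheck to verify that the reload is folded into
; vcvtne2ps2bf16 as a 32-byte memory operand.
declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>)

define <16 x bfloat> @sketch_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) {
  %nop = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %cvt = call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1)
  ret <16 x bfloat> %cvt
}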
-define <32 x i16> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) { +define <32 x bfloat> @stack_fold_cvtne2ps2bf16(<16 x float> %a0, <16 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -19,12 +19,12 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) - ret <32 x i16> %2 + %2 = call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) + ret <32 x bfloat> %2 } -declare <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) -define <32 x i16> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i32 %U) { +define <32 x bfloat> @stack_fold_cvtne2ps2bf16_mask(<16 x float> %a0, <16 x float> %a1, ptr %passthru, i32 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -37,15 +37,15 @@ ; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) + %2 = call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) %3 = bitcast i32 %U to <32 x i1> ; load needed to keep the operation from being scheduled above the asm block - %4 = load <32 x i16>, ptr %passthru - %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4 - ret <32 x i16> %5 + %4 = load <32 x bfloat>, ptr %passthru + %5 = select <32 x i1> %3, <32 x bfloat> %2, <32 x bfloat> %4 + ret <32 x bfloat> %5 } -define <32 x i16> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) { +define <32 x bfloat> @stack_fold_cvtne2ps2bf16_maskz(<16 x float> %a0, <16 x float> %a1, i32 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -56,13 +56,13 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <32 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) + %2 = call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %a0, <16 x float> %a1) %3 = bitcast i32 %U to <32 x i1> - %4 = select <32 x i1> %3, <32 x i16> 
%2, <32 x i16> zeroinitializer - ret <32 x i16> %4 + %4 = select <32 x i1> %3, <32 x bfloat> %2, <32 x bfloat> zeroinitializer + ret <32 x bfloat> %4 } -define <16 x i16> @stack_fold_cvtneps2bf16(<16 x float> %a0) { +define <16 x bfloat> @stack_fold_cvtneps2bf16(<16 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -72,12 +72,12 @@ ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) - ret <16 x i16> %2 + %2 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) + ret <16 x bfloat> %2 } -declare <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) -define <16 x i16> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, ptr %passthru, i16 %U) { +define <16 x bfloat> @stack_fold_cvtneps2bf16_mask(<16 x float> %a0, ptr %passthru, i16 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -90,15 +90,15 @@ ; CHECK-NEXT: vmovaps %ymm1, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) + %2 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) %3 = bitcast i16 %U to <16 x i1> ; load needed to keep the operation from being scheduled above the asm block - %4 = load <16 x i16>, ptr %passthru - %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4 - ret <16 x i16> %5 + %4 = load <16 x bfloat>, ptr %passthru + %5 = select <16 x i1> %3, <16 x bfloat> %2, <16 x bfloat> %4 + ret <16 x bfloat> %5 } -define <16 x i16> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) { +define <16 x bfloat> @stack_fold_cvtneps2bf16_maskz(<16 x float> %a0, i16 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -109,10 +109,10 @@ ; CHECK-NEXT: vcvtneps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 {%k1} {z} # 64-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <16 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) + %2 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %a0) %3 = bitcast i16 %U to <16 x i1> - %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer - ret <16 x i16> %4 + %4 = select <16 
x i1> %3, <16 x bfloat> %2, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %4 } define <16 x float> @stack_fold_vdpbf16ps(<16 x float> %a0, <16 x i32> %a1, <16 x i32> %a2) { @@ -171,7 +171,7 @@ -define <16 x i16> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) { +define <16 x bfloat> @stack_fold_cvtne2ps2bf16_ymm(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -181,12 +181,12 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) - ret <16 x i16> %2 + %2 = call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) + ret <16 x bfloat> %2 } -declare <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) -define <16 x i16> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i16 %U) { +define <16 x bfloat> @stack_fold_cvtne2ps2bf16_mask_ymm(<8 x float> %a0, <8 x float> %a1, ptr %passthru, i16 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -199,15 +199,15 @@ ; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) + %2 = call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) %3 = bitcast i16 %U to <16 x i1> ; load needed to keep the operation from being scheduled above the asm block - %4 = load <16 x i16>, ptr %passthru - %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4 - ret <16 x i16> %5 + %4 = load <16 x bfloat>, ptr %passthru + %5 = select <16 x i1> %3, <16 x bfloat> %2, <16 x bfloat> %4 + ret <16 x bfloat> %5 } -define <16 x i16> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) { +define <16 x bfloat> @stack_fold_cvtne2ps2bf16_maskz_ymm(<8 x float> %a0, <8 x float> %a1, i16 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -218,13 +218,13 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 {%k1} {z} # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <16 x i16> 
@llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) + %2 = call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %a0, <8 x float> %a1) %3 = bitcast i16 %U to <16 x i1> - %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer - ret <16 x i16> %4 + %4 = select <16 x i1> %3, <16 x bfloat> %2, <16 x bfloat> zeroinitializer + ret <16 x bfloat> %4 } -define <8 x i16> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_ymm(<8 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -235,12 +235,12 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) - ret <8 x i16> %2 + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) + ret <8 x bfloat> %2 } -declare <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) -define <8 x i16> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, ptr %passthru, i8 %U) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_mask_ymm(<8 x float> %a0, ptr %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -254,15 +254,15 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) %3 = bitcast i8 %U to <8 x i1> ; load needed to keep the operation from being scheduled above the asm block - %4 = load <8 x i16>, ptr %passthru - %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4 - ret <8 x i16> %5 + %4 = load <8 x bfloat>, ptr %passthru + %5 = select <8 x i1> %3, <8 x bfloat> %2, <8 x bfloat> %4 + ret <8 x bfloat> %5 } -define <8 x i16> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_maskz_ymm(<8 x float> %a0, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_ymm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -274,10 +274,10 @@ ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %a0) %3 = bitcast i8 %U to <8 x i1> - %4 = 
select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer - ret <8 x i16> %4 + %4 = select <8 x i1> %3, <8 x bfloat> %2, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %4 } define <8 x float> @stack_fold_vdpbf16ps_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x i32> %a2) { @@ -338,7 +338,7 @@ -define <8 x i16> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) { +define <8 x bfloat> @stack_fold_cvtne2ps2bf16_xmm(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -348,12 +348,12 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) - ret <8 x i16> %2 + %2 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) + ret <8 x bfloat> %2 } -declare <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) -define <8 x i16> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %U) { +define <8 x bfloat> @stack_fold_cvtne2ps2bf16_mask_xmm(<4 x float> %a0, <4 x float> %a1, ptr %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_mask_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -366,15 +366,15 @@ ; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) + %2 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) %3 = bitcast i8 %U to <8 x i1> ; load needed to keep the operation from being scheduled above the asm block - %4 = load <8 x i16>, ptr %passthru - %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4 - ret <8 x i16> %5 + %4 = load <8 x bfloat>, ptr %passthru + %5 = select <8 x i1> %3, <8 x bfloat> %2, <8 x bfloat> %4 + ret <8 x bfloat> %5 } -define <8 x i16> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) { +define <8 x bfloat> @stack_fold_cvtne2ps2bf16_maskz_xmm(<4 x float> %a0, <4 x float> %a1, i8 %U) { ; CHECK-LABEL: stack_fold_cvtne2ps2bf16_maskz_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -385,13 +385,13 @@ ; CHECK-NEXT: vcvtne2ps2bf16 {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 {%k1} {z} # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = call <8 x i16> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) + %2 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %a0, <4 x float> %a1) %3 = bitcast i8 %U to <8 x i1> - %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer - ret <8 x i16> %4 + %4 = select <8 x i1> %3, <8 x bfloat> %2, <8 x bfloat> zeroinitializer + ret <8 x bfloat> %4 } -define <8 x i16> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_xmm(<4 x float> %a0) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -401,12 +401,12 @@ ; CHECK-NEXT: vcvtneps2bf16x {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> undef, <4 x i1> ) - ret <8 x i16> %2 + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x bfloat> undef, <4 x i1> ) + ret <8 x bfloat> %2 } -declare <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x i16>, <4 x i1>) +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) -define <8 x i16> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, ptr %passthru, i8 %U) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_mask_xmm(<4 x float> %a0, ptr %passthru, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_mask_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -419,14 +419,14 @@ ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %2 = load <8 x i16>, ptr %passthru + %2 = load <8 x bfloat>, ptr %passthru %3 = bitcast i8 %U to <8 x i1> %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> - %5 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> %2, <4 x i1> %4) - ret <8 x i16> %5 + %5 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x bfloat> %2, <4 x i1> %4) + ret <8 x bfloat> %5 } -define <8 x i16> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) { +define <8 x bfloat> @stack_fold_cvtneps2bf16_maskz_xmm(<4 x float> %a0, i8 %U) { ; CHECK-LABEL: stack_fold_cvtneps2bf16_maskz_xmm: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -439,8 +439,8 @@ %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = bitcast i8 %U to <8 x i1> %3 = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> - %4 = tail call <8 x i16> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x i16> zeroinitializer, <4 x i1> %3) - ret <8 x i16> %4 + %4 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %a0, <8 x bfloat> zeroinitializer, <4 x i1> %3) + ret <8 x bfloat> %4 } define <4 x float> @stack_fold_vdpbf16ps_xmm(<4 x float> %a0, <4 x i32> %a1, <4 x i32> %a2) {