diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -3553,6 +3553,8 @@ .. option:: -mavx512f, -mno-avx512f +.. option:: -mavx512fp16, -mno-avx512fp16 + .. option:: -mavx512ifma, -mno-avx512ifma .. option:: -mavx512pf, -mno-avx512pf diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -596,6 +596,7 @@ * 64-bit ARM (AArch64) * AMDGPU * SPIR +* X86 (Only available under feature AVX512-FP16) ``_Float16`` will be supported on more targets as they define ABIs for it. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -133,7 +133,7 @@ X86 Support in Clang -------------------- -- ... +- Support for ``AVX512-FP16`` instructions has been added. Internal API Changes -------------------- diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1849,6 +1849,10 @@ TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl") TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl") +// AVX512 fp16 intrinsics +TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") + // generic select intrinsics TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") @@ -1859,6 +1863,9 @@ TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f") @@ -1868,6 +1875,7 @@ TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4141,6 +4141,8 @@ def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group; def mavx512er : Flag<["-"], "mavx512er">, Group; def mno_avx512er : Flag<["-"], 
"mno-avx512er">, Group; +def mavx512fp16 : Flag<["-"], "mavx512fp16">, Group; +def mno_avx512fp16 : Flag<["-"], "mno-avx512fp16">, Group; def mavx512ifma : Flag<["-"], "mavx512ifma">, Group; def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group; def mavx512pf : Flag<["-"], "mavx512pf">, Group; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -92,6 +92,7 @@ bool HasAVX512CD = false; bool HasAVX512VPOPCNTDQ = false; bool HasAVX512VNNI = false; + bool HasAVX512FP16 = false; bool HasAVX512BF16 = false; bool HasAVX512ER = false; bool HasAVX512PF = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -231,6 +231,9 @@ HasAVX512BF16 = true; } else if (Feature == "+avx512er") { HasAVX512ER = true; + } else if (Feature == "+avx512fp16") { + HasAVX512FP16 = true; + HasFloat16 = true; } else if (Feature == "+avx512pf") { HasAVX512PF = true; } else if (Feature == "+avx512dq") { @@ -668,6 +671,8 @@ Builder.defineMacro("__AVX512BF16__"); if (HasAVX512ER) Builder.defineMacro("__AVX512ER__"); + if (HasAVX512FP16) + Builder.defineMacro("__AVX512FP16__"); if (HasAVX512PF) Builder.defineMacro("__AVX512PF__"); if (HasAVX512DQ) @@ -856,6 +861,7 @@ .Case("avx512vnni", true) .Case("avx512bf16", true) .Case("avx512er", true) + .Case("avx512fp16", true) .Case("avx512pf", true) .Case("avx512dq", true) .Case("avx512bitalg", true) @@ -948,6 +954,7 @@ .Case("avx512vnni", HasAVX512VNNI) .Case("avx512bf16", HasAVX512BF16) .Case("avx512er", HasAVX512ER) + .Case("avx512fp16", HasAVX512FP16) .Case("avx512pf", HasAVX512PF) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12691,6 +12691,7 @@ case X86::BI__builtin_ia32_storeups512_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_storesh128_mask: case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); @@ -12826,6 +12827,7 @@ case X86::BI__builtin_ia32_loaddqudi512_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_loadsh128_mask: case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); @@ -13705,6 +13707,9 @@ case X86::BI__builtin_ia32_selectq_128: case X86::BI__builtin_ia32_selectq_256: case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: case X86::BI__builtin_ia32_selectps_128: case X86::BI__builtin_ia32_selectps_256: case X86::BI__builtin_ia32_selectps_512: @@ -13712,6 +13717,7 @@ case X86::BI__builtin_ia32_selectpd_256: case X86::BI__builtin_ia32_selectpd_512: return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); + case X86::BI__builtin_ia32_selectsh_128: case X86::BI__builtin_ia32_selectss_128: case X86::BI__builtin_ia32_selectsd_128: { Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -2812,7 +2812,8 @@ Hi = Integer; } else if (k >= BuiltinType::Bool && k <= 
BuiltinType::LongLong) {
     Current = Integer;
-  } else if (k == BuiltinType::Float || k == BuiltinType::Double) {
+  } else if (k == BuiltinType::Float || k == BuiltinType::Double ||
+             k == BuiltinType::Float16) {
     Current = SSE;
   } else if (k == BuiltinType::LongDouble) {
     const llvm::fltSemantics *LDF = &getTarget().getLongDoubleFormat();
@@ -2943,7 +2944,7 @@
         Current = Integer;
       else if (Size <= 128)
         Lo = Hi = Integer;
-    } else if (ET == getContext().FloatTy) {
+    } else if (ET->isFloat16Type() || ET == getContext().FloatTy) {
       Current = SSE;
     } else if (ET == getContext().DoubleTy) {
       Lo = Hi = SSE;
@@ -3396,27 +3397,76 @@
   return false;
 }
 
+/// ContainsHalfAtOffset - Return true if the specified LLVM IR type has a
+/// half member at the specified offset. For example, {int,{half}} has a
+/// half at offset 4. It is conservatively correct for this routine to return
+/// false.
+/// FIXME: Merge with ContainsFloatAtOffset
+static bool ContainsHalfAtOffset(llvm::Type *IRType, unsigned IROffset,
+                                 const llvm::DataLayout &TD) {
+  // Base case if we find a half.
+  if (IROffset == 0 && IRType->isHalfTy())
+    return true;
+
+  // If this is a struct, recurse into the field at the specified offset.
+  if (llvm::StructType *STy = dyn_cast<llvm::StructType>(IRType)) {
+    const llvm::StructLayout *SL = TD.getStructLayout(STy);
+    unsigned Elt = SL->getElementContainingOffset(IROffset);
+    IROffset -= SL->getElementOffset(Elt);
+    return ContainsHalfAtOffset(STy->getElementType(Elt), IROffset, TD);
+  }
+
+  // If this is an array, recurse into the element at the specified offset.
+  if (llvm::ArrayType *ATy = dyn_cast<llvm::ArrayType>(IRType)) {
+    llvm::Type *EltTy = ATy->getElementType();
+    unsigned EltSize = TD.getTypeAllocSize(EltTy);
+    IROffset -= IROffset / EltSize * EltSize;
+    return ContainsHalfAtOffset(EltTy, IROffset, TD);
+  }
+
+  return false;
+}
+
 /// GetSSETypeAtOffset - Return a type that will be passed by the backend in the
 /// low 8 bytes of an XMM register, corresponding to the SSE class.
 llvm::Type *X86_64ABIInfo::
 GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset,
                    QualType SourceTy, unsigned SourceOffset) const {
-  // The only three choices we have are either double, <2 x float>, or float. We
-  // pass as float if the last 4 bytes is just padding.  This happens for
-  // structs that contain 3 floats.
-  if (BitsContainNoUserData(SourceTy, SourceOffset*8+32,
-                            SourceOffset*8+64, getContext()))
-    return llvm::Type::getFloatTy(getVMContext());
+  // If the high 32 bits are not used, we have three choices: a single half, a
+  // single float, or two halves.
+  if (BitsContainNoUserData(SourceTy, SourceOffset * 8 + 32,
+                            SourceOffset * 8 + 64, getContext())) {
+    if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()))
+      return llvm::Type::getFloatTy(getVMContext());
+    if (ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout()))
+      return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()),
+                                        2);
+
+    return llvm::Type::getHalfTy(getVMContext());
+  }
 
   // We want to pass as <2 x float> if the LLVM IR type contains a float at
-  // offset+0 and offset+4.  Walk the LLVM IR type to find out if this is the
+  // offset+0 and offset+4. Walk the LLVM IR type to find out if this is the
   // case.
   if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) &&
-      ContainsFloatAtOffset(IRType, IROffset+4, getDataLayout()))
+      ContainsFloatAtOffset(IRType, IROffset + 4, getDataLayout()))
     return llvm::FixedVectorType::get(llvm::Type::getFloatTy(getVMContext()), 2);
 
+  // We want to pass as <4 x half> if the LLVM IR type contains a half at
+  // offset+0, +2, +4. Walk the LLVM IR type to find out if this is the case.
+  if (ContainsHalfAtOffset(IRType, IROffset, getDataLayout()) &&
+      ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout()) &&
+      ContainsHalfAtOffset(IRType, IROffset + 4, getDataLayout()))
+    return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 4);
+
+  // We want to pass as <4 x half> if the LLVM IR type contains a mix of float
+  // and half.
+  // FIXME: Do we have a better representation for the mixed type?
+  if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) ||
+      ContainsFloatAtOffset(IRType, IROffset + 4, getDataLayout()))
+    return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 4);
+
   return llvm::Type::getDoubleTy(getVMContext());
 }
 
@@ -3521,11 +3571,11 @@
   // struct.
   if (HiStart != 8) {
     // There are usually two sorts of types the ABI generation code can produce
-    // for the low part of a pair that aren't 8 bytes in size: float or
+    // for the low part of a pair that aren't 8 bytes in size: half, float or
     // i8/i16/i32.  This can also include pointers when they are 32-bit (X32 and
     // NaCl).
     // Promote these to a larger type.
-    if (Lo->isFloatTy())
+    if (Lo->isHalfTy() || Lo->isFloatTy())
       Lo = llvm::Type::getDoubleTy(Lo->getContext());
     else {
       assert((Lo->isIntegerTy() || Lo->isPointerTy())
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -17,6 +17,7 @@
   avx512dqintrin.h
   avx512erintrin.h
   avx512fintrin.h
+  avx512fp16intrin.h
   avx512ifmaintrin.h
   avx512ifmavlintrin.h
   avx512pfintrin.h
@@ -28,6 +29,7 @@
   avx512vlbwintrin.h
   avx512vlcdintrin.h
   avx512vldqintrin.h
+  avx512vlfp16intrin.h
   avx512vlintrin.h
   avx512vp2intersectintrin.h
   avx512vlvp2intersectintrin.h
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -0,0 +1,444 @@
+/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16INTRIN_H
+#define __AVX512FP16INTRIN_H
+
+/* Define the 512-, 256- and 128-bit _Float16 vector types used in this file.
*/ +typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { + return __a[0]; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { + return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { + return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { + return (__m256h)__builtin_ia32_undef256(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { + return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { + return (__m128h)__builtin_ia32_undef128(); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { + return (__m512h)__builtin_ia32_undef512(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { + return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + return (__m512h)(__v32hf){__h1, __h2, __h3, __h4, __h5, __h6, __h7, + __h8, __h9, __h10, __h11, __h12, __h13, __h14, + __h15, __h16, __h17, __h18, __h19, __h20, __h21, + __h22, __h23, __h24, __h25, __h26, __h27, __h28, + __h29, __h30, __h31, __h32}; +} + +#define _mm512_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, __h9, \ + __h10, __h11, __h12, __h13, __h14, __h15, __h16, __h17, \ + __h18, 
__h19, __h20, __h21, __h22, __h23, __h24, __h25, \ + __h26, __h27, __h28, __h29, __h30, __h31, __h32) \ + _mm512_set_ph((__h32), (__h31), (__h30), (__h29), (__h28), (__h27), (__h26), \ + (__h25), (__h24), (__h23), (__h22), (__h21), (__h20), (__h19), \ + (__h18), (__h17), (__h16), (__h15), (__h14), (__h13), (__h12), \ + (__h11), (__h10), (__h9), (__h8), (__h7), (__h6), (__h5), \ + (__h4), (__h3), (__h2), (__h1)) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { + return (__m256)__a; +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { + return (__m512)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { + return (__m128d)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { + return (__m256d)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { + return (__m512d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castph_si256(__m256h __a) { + return (__m256i)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castph_si512(__m512h __a) { + return (__m512i)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castsi256_ph(__m256i __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castsi512_ph(__m512i __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_castph256_ph128(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph128(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph256(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castph128_ph256(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castph128_ph512(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, 
-1, -1,
+                                 -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+/// Constructs a 256-bit floating-point vector of [16 x half] from a
+/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x half].
+/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_zextph128_ph256(__m128h __a) {
+  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
+                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
+/// contain the value of the source vector. The upper 384 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_zextph128_ph512(__m128h __a) {
+  return __builtin_shufflevector(
+      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+/// 256-bit floating-point vector of [16 x half]. The lower 256 bits
+/// contain the value of the source vector. The upper 256 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [16 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_zextph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { + return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); +} + +// loads with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { + struct __mm_load_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u; + return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { + __m128h src = (__v8hf)__builtin_shufflevector( + (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sh(__mmask8 __U, const void *__A) { + return (__m128h)__builtin_ia32_loadsh128_mask( + (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_load_ph(void const *__p) { + return *(const __m512h *)__p; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_load_ph(void const *__p) { + return *(const __m256h *)__p; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { + return *(const __m128h *)__p; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_loadu_ph(void const *__p) { + struct __loadu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_loadu_ph(void const *__p) { + struct __loadu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { + struct __loadu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +// stores with vmovsh: +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, + __m128h __a) { + struct __mm_store_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, + __mmask8 __U, + __m128h __A) { + __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, + __m512h __A) { + *(__m512h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, + __m256h __A) { + *(__m256h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, + __m128h __A) { + *(__m128h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, + __m512h __A) { + struct __storeu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, + __m256h __A) { + struct __storeu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = 
__A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, + __m128h __A) { + struct __storeu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +// moves with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, + __m128h __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), + _mm_setzero_ph()); +} + +// vmovw: +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { + return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { + __v8hi __b = (__v8hi)__a; + return __b[0]; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, + (__v32hf)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { + return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_ph(__m512i __A, __m512h __B) { + return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -0,0 +1,119 @@ +/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX512VLFP16INTRIN_H +#define __AVX512VLFP16INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { + return __a[0]; +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { + return __a[0]; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { + return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { + return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { + return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 +_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + return (__m128h)(__v8hf){__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 +_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + return (__m256h)(__v16hf){__h1, __h2, __h3, __h4, __h5, __h6, + __h7, __h8, __h9, __h10, __h11, __h12, + __h13, __h14, __h15, __h16}; +} + +#define _mm_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8) \ + _mm_set_ph((__h8), (__h7), (__h6), (__h5), (__h4), (__h3), (__h2), (__h1)) + +#define _mm256_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, __h9, \ + __h10, __h11, __h12, __h13, __h14, __h15, __h16) \ + _mm256_set_ph((__h16), (__h15), (__h14), (__h13), (__h12), (__h11), (__h10), \ + (__h9), (__h8), (__h7), (__h6), (__h5), (__h4), (__h3), \ + (__h2), (__h1)) + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { + return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { + return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, + __m128h __A, + __m128h __W) { + return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, + (__v8hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, + (__v16hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { + return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { + return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutexvar_ph(__m128i __A, __m128h __B) { + return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 
+_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
+  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -195,6 +195,7 @@
 #define bit_PCONFIG       0x00040000
 #define bit_IBT           0x00100000
 #define bit_AMXBF16       0x00400000
+#define bit_AVX512FP16    0x00800000
 #define bit_AMXTILE       0x01000000
 #define bit_AMXINT8       0x02000000
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -210,6 +210,20 @@
 #include
 #endif
 
+/*
+ * FIXME: The _Float16 type is only legal when the hardware supports float16
+ * operations. We use __AVX512FP16__ to check whether float16 is supported,
+ * so when float16 is not supported, the related header is not included.
+ *
+ */
+#if defined(__AVX512FP16__)
+#include <avx512fp16intrin.h>
+#endif
+
+#if defined(__AVX512FP16__) && defined(__AVX512VL__)
+#include <avx512vlfp16intrin.h>
+#endif
+
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||     \
     defined(__AVX512BF16__)
 #include <avx512bf16intrin.h>
diff --git a/clang/test/CodeGen/X86/avx512fp16-abi.c b/clang/test/CodeGen/X86/avx512fp16-abi.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/X86/avx512fp16-abi.c
@@ -0,0 +1,149 @@
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -target-feature +avx512fp16 < %s | FileCheck %s --check-prefixes=CHECK
+
+struct half1 {
+  _Float16 a;
+};
+
+struct half1 h1(_Float16 a) {
+  // CHECK: define{{.*}}half @h1
+  struct half1 x;
+  x.a = a;
+  return x;
+}
+
+struct half2 {
+  _Float16 a;
+  _Float16 b;
+};
+
+struct half2 h2(_Float16 a, _Float16 b) {
+  // CHECK: define{{.*}}<2 x half> @h2
+  struct half2 x;
+  x.a = a;
+  x.b = b;
+  return x;
+}
+
+struct half3 {
+  _Float16 a;
+  _Float16 b;
+  _Float16 c;
+};
+
+struct half3 h3(_Float16 a, _Float16 b, _Float16 c) {
+  // CHECK: define{{.*}}<4 x half> @h3
+  struct half3 x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  return x;
+}
+
+struct half4 {
+  _Float16 a;
+  _Float16 b;
+  _Float16 c;
+  _Float16 d;
+};
+
+struct half4 h4(_Float16 a, _Float16 b, _Float16 c, _Float16 d) {
+  // CHECK: define{{.*}}<4 x half> @h4
+  struct half4 x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  x.d = d;
+  return x;
+}
+
+struct floathalf {
+  float a;
+  _Float16 b;
+};
+
+struct floathalf fh(float a, _Float16 b) {
+  // CHECK: define{{.*}}<4 x half> @fh
+  struct floathalf x;
+  x.a = a;
+  x.b = b;
+  return x;
+}
+
+struct floathalf2 {
+  float a;
+  _Float16 b;
+  _Float16 c;
+};
+
+struct floathalf2 fh2(float a, _Float16 b, _Float16 c) {
+  // CHECK: define{{.*}}<4 x half> @fh2
+  struct floathalf2 x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  return x;
+}
+
+struct halffloat {
+  _Float16 a;
+  float b;
+};
+
+struct halffloat hf(_Float16 a, float b) {
+  // CHECK: define{{.*}}<4 x half> @hf
+  struct halffloat x;
+  x.a = a;
+  x.b = b;
+  return x;
+}
+
+struct half2float {
+  _Float16 a;
+  _Float16 b;
+  float c;
+};
+
+struct half2float h2f(_Float16 a, _Float16 b, float c) {
+  // CHECK: define{{.*}}<4 x half> @h2f
+  struct half2float x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  return x;
+}
+
+struct floathalf3 {
+  float a;
+  _Float16 b;
+  _Float16 c;
+  _Float16 d;
+};
+
+struct floathalf3 fh3(float a, _Float16 b, _Float16 c, _Float16 d) {
+  // CHECK: define{{.*}}{ <4 x half>, half } @fh3
+  struct floathalf3 x;
+  x.a = a;
+  x.b = b;
+  x.c = c;
+  x.d = d;
+  return x;
+}
+
+struct half5 {
+  _Float16 a;
+  _Float16 b;
+  _Float16 c;
+  _Float16 d;
+ 
_Float16 e; +}; + +struct half5 h5(_Float16 a, _Float16 b, _Float16 c, _Float16 d, _Float16 e) { + // CHECK: define{{.*}}{ <4 x half>, half } @h5 + struct half5 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + x.e = e; + return x; +} diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -0,0 +1,526 @@ +// RUN: %clang_cc1 -ffreestanding -flax-vector-conversions=none %s -triple=x86_64-unknown-unknown -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +_Float16 test_mm512_cvtsh_h(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtsh_h + // CHECK: extractelement <32 x half> %{{.*}}, i32 0 + return _mm512_cvtsh_h(__A); +} + +__m128h test_mm_setzero_ph() { + // CHECK-LABEL: @test_mm_setzero_ph + // CHECK: zeroinitializer + return _mm_setzero_ph(); +} + +__m256h test_mm256_setzero_ph() { + // CHECK-LABEL: @test_mm256_setzero_ph + // CHECK: zeroinitializer + return _mm256_setzero_ph(); +} + +__m256h test_mm256_undefined_ph() { + // CHECK-LABEL: @test_mm256_undefined_ph + // CHECK: ret <16 x half> zeroinitializer + return _mm256_undefined_ph(); +} + +__m512h test_mm512_setzero_ph() { + // CHECK-LABEL: @test_mm512_setzero_ph + // CHECK: zeroinitializer + return _mm512_setzero_ph(); +} + +__m128h test_mm_undefined_ph() { + // CHECK-LABEL: @test_mm_undefined_ph + // CHECK: ret <8 x half> zeroinitializer + return _mm_undefined_ph(); +} + +__m512h test_mm512_undefined_ph() { + // CHECK-LABEL: @test_mm512_undefined_ph + // CHECK: ret <32 x half> zeroinitializer + return _mm512_undefined_ph(); +} + +__m512h test_mm512_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm512_set1_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + // CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_set1_ph(h); +} + +__m512h test_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, 
_Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + // CHECK-LABEL: @test_mm512_set_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + // CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16, + __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24, + __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32); +} + +__m512h test_mm512_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + // CHECK-LABEL: @test_mm512_setr_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + 
// CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16, + __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24, + __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32); +} + +__m128 test_mm_castph_ps(__m128h A) { + // CHECK-LABEL: test_mm_castph_ps + // CHECK: bitcast <8 x half> %{{.*}} to <4 x float> + return _mm_castph_ps(A); +} + +__m256 test_mm256_castph_ps(__m256h A) { + // CHECK-LABEL: test_mm256_castph_ps + // CHECK: bitcast <16 x half> %{{.*}} to <8 x float> + return _mm256_castph_ps(A); +} + +__m512 test_mm512_castph_ps(__m512h A) { + // CHECK-LABEL: test_mm512_castph_ps + // CHECK: bitcast <32 x half> %{{.*}} to <16 x float> + return _mm512_castph_ps(A); +} + +__m128d test_mm_castph_pd(__m128h A) { + // CHECK-LABEL: test_mm_castph_pd + // CHECK: bitcast <8 x half> %{{.*}} to <2 x double> + return _mm_castph_pd(A); +} + +__m256d test_mm256_castph_pd(__m256h A) { + // CHECK-LABEL: test_mm256_castph_pd + // CHECK: bitcast <16 x half> %{{.*}} to <4 x double> + return _mm256_castph_pd(A); +} + +__m512d test_mm512_castph_pd(__m512h A) { + // CHECK-LABEL: test_mm512_castph_pd + // CHECK: bitcast <32 x half> %{{.*}} to <8 x double> + return _mm512_castph_pd(A); +} + +__m128i test_mm_castph_si128(__m128h A) { + // CHECK-LABEL: test_mm_castph_si128 + // CHECK: bitcast <8 x half> %{{.*}} to <2 x i64> + return _mm_castph_si128(A); +} + +__m256i test_mm256_castph_si256(__m256h A) { + // CHECK-LABEL: test_mm256_castph_si256 + // CHECK: bitcast <16 x half> %{{.*}} to <4 x i64> + return _mm256_castph_si256(A); +} + +__m512i test_mm512_castph_si512(__m512h A) { + // CHECK-LABEL: test_mm512_castph_si512 + // CHECK: bitcast <32 x half> %{{.*}} to <8 x i64> + return _mm512_castph_si512(A); +} + +__m128h test_mm_castps_ph(__m128 A) { + // CHECK-LABEL: test_mm_castps_ph + // CHECK: bitcast <4 x float> %{{.*}} to <8 x half> + return _mm_castps_ph(A); +} + +__m256h test_mm256_castps_ph(__m256 A) { + // CHECK-LABEL: test_mm256_castps_ph + // CHECK: bitcast <8 x float> %{{.*}} to <16 x half> + return _mm256_castps_ph(A); +} + +__m512h test_mm512_castps_ph(__m512 A) { + // CHECK-LABEL: test_mm512_castps_ph + // CHECK: bitcast <16 x float> %{{.*}} to <32 x half> + return _mm512_castps_ph(A); +} + +__m128h test_mm_castpd_ph(__m128d A) { + // CHECK-LABEL: test_mm_castpd_ph + // CHECK: bitcast 
<2 x double> %{{.*}} to <8 x half> + return _mm_castpd_ph(A); +} + +__m256h test_mm256_castpd_ph(__m256d A) { + // CHECK-LABEL: test_mm256_castpd_ph + // CHECK: bitcast <4 x double> %{{.*}} to <16 x half> + return _mm256_castpd_ph(A); +} + +__m512h test_mm512_castpd_ph(__m512d A) { + // CHECK-LABEL: test_mm512_castpd_ph + // CHECK: bitcast <8 x double> %{{.*}} to <32 x half> + return _mm512_castpd_ph(A); +} + +__m128h test_mm_castsi128_ph(__m128i A) { + // CHECK-LABEL: test_mm_castsi128_ph + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x half> + return _mm_castsi128_ph(A); +} + +__m256h test_mm256_castsi256_ph(__m256i A) { + // CHECK-LABEL: test_mm256_castsi256_ph + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x half> + return _mm256_castsi256_ph(A); +} + +__m512h test_mm512_castsi512_ph(__m512i A) { + // CHECK-LABEL: test_mm512_castsi512_ph + // CHECK: bitcast <8 x i64> %{{.*}} to <32 x half> + return _mm512_castsi512_ph(A); +} + +__m128h test_mm256_castph256_ph128(__m256h __a) { + // CHECK-LABEL: test_mm256_castph256_ph128 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x i32> + return _mm256_castph256_ph128(__a); +} + +__m128h test_mm512_castph512_ph128(__m512h __a) { + // CHECK-LABEL: test_mm512_castph512_ph128 + // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <8 x i32> + return _mm512_castph512_ph128(__a); +} + +__m256h test_mm512_castph512_ph256(__m512h __a) { + // CHECK-LABEL: test_mm512_castph512_ph256 + // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <16 x i32> + return _mm512_castph512_ph256(__a); +} + +__m256h test_mm256_castph128_ph256(__m128h __a) { + // CHECK-LABEL: test_mm256_castph128_ph256 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x i32> + return _mm256_castph128_ph256(__a); +} + +__m512h test_mm512_castph128_ph512(__m128h __a) { + // CHECK-LABEL: test_mm512_castph128_ph512 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> %{{.*}}, <32 x i32> + return _mm512_castph128_ph512(__a); +} + +__m512h test_mm512_castph256_ph512(__m256h __a) { + // CHECK-LABEL: test_mm512_castph256_ph512 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> %{{.*}}, <32 x i32> + return _mm512_castph256_ph512(__a); +} + +__m256h test_mm256_zextph128_ph256(__m128h __a) { + // CHECK-LABEL: test_mm256_zextph128_ph256 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <16 x i32> + return _mm256_zextph128_ph256(__a); +} + +__m512h test_mm512_zextph128_ph512(__m128h __a) { + // CHECK-LABEL: test_mm512_zextph128_ph512 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <32 x i32> + return _mm512_zextph128_ph512(__a); +} + +__m512h test_mm512_zextph256_ph512(__m256h __a) { + // CHECK-LABEL: test_mm512_zextph256_ph512 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> {{.*}}, <32 x i32> + return _mm512_zextph256_ph512(__a); +} + +__m512h test_mm512_abs_ph(__m512h a) { + // CHECK-LABEL: @test_mm512_abs_ph + // CHECK: and <16 x i32> + return _mm512_abs_ph(a); +} + +// VMOVSH + +__m128h test_mm_load_sh(void const *A) { + // CHECK-LABEL: test_mm_load_sh + // CHECK: load half, half* %{{.*}}, align 1{{$}} + return _mm_load_sh(A); +} + +__m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_mask_load_sh + // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + return _mm_mask_load_sh(__A, __U, __W); +} + +__m128h test_mm_maskz_load_sh(__mmask8 __U, const void 
*__W) { + // CHECK-LABEL: @test_mm_maskz_load_sh + // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + return _mm_maskz_load_sh(__U, __W); +} + +__m512h test_mm512_load_ph(void *p) { + // CHECK-LABEL: @test_mm512_load_ph + // CHECK: load <32 x half>, <32 x half>* %{{.*}}, align 64 + return _mm512_load_ph(p); +} + +__m256h test_mm256_load_ph(void *p) { + // CHECK-LABEL: @test_mm256_load_ph + // CHECK: load <16 x half>, <16 x half>* %{{.*}}, align 32 + return _mm256_load_ph(p); +} + +__m128h test_mm_load_ph(void *p) { + // CHECK-LABEL: @test_mm_load_ph + // CHECK: load <8 x half>, <8 x half>* %{{.*}}, align 16 + return _mm_load_ph(p); +} + +__m512h test_mm512_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm512_loadu_ph + // CHECK: load <32 x half>, <32 x half>* {{.*}}, align 1{{$}} + return _mm512_loadu_ph(p); +} + +__m256h test_mm256_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm256_loadu_ph + // CHECK: load <16 x half>, <16 x half>* {{.*}}, align 1{{$}} + return _mm256_loadu_ph(p); +} + +__m128h test_mm_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm_loadu_ph + // CHECK: load <8 x half>, <8 x half>* {{.*}}, align 1{{$}} + return _mm_loadu_ph(p); +} + +void test_mm_store_sh(void *A, __m128h B) { + // CHECK-LABEL: test_mm_store_sh + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: store half %{{.*}}, half* %{{.*}}, align 1{{$}} + _mm_store_sh(A, B); +} + +void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_store_sh + // CHECK: call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %{{.*}}, <8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}) + _mm_mask_store_sh(__P, __U, __A); +} + +void test_mm512_store_ph(void *p, __m512h a) { + // CHECK-LABEL: @test_mm512_store_ph + // CHECK: store <32 x half> %{{.*}}, <32 x half>* %{{.*}}, align 64 + _mm512_store_ph(p, a); +} + +void test_mm256_store_ph(void *p, __m256h a) { + // CHECK-LABEL: @test_mm256_store_ph + // CHECK: store <16 x half> %{{.*}}, <16 x half>* %{{.*}}, align 32 + _mm256_store_ph(p, a); +} + +void test_mm_store_ph(void *p, __m128h a) { + // CHECK-LABEL: @test_mm_store_ph + // CHECK: store <8 x half> %{{.*}}, <8 x half>* %{{.*}}, align 16 + _mm_store_ph(p, a); +} + +void test_mm512_storeu_ph(void *p, __m512h a) { + // CHECK-LABEL: @test_mm512_storeu_ph + // CHECK: store <32 x half> %{{.*}}, <32 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm512_storeu_ph(p, a); +} + +void test_mm256_storeu_ph(void *p, __m256h a) { + // CHECK-LABEL: @test_mm256_storeu_ph + // CHECK: store <16 x half> %{{.*}}, <16 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm256_storeu_ph(p, a); +} + +void test_mm_storeu_ph(void *p, __m128h a) { + // CHECK-LABEL: @test_mm_storeu_ph + // CHECK: store <8 x half> %{{.*}}, <8 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_ph(p, a); +} + +__m128h test_mm_move_sh(__m128h A, __m128h B) { + // CHECK-LABEL: test_mm_move_sh + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_move_sh(A, B); +} + +__m128h test_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_move_sh + // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x 
half> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]] + // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 + return _mm_mask_move_sh(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_move_sh + // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]] + // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 + return _mm_maskz_move_sh(__U, __A, __B); +} + +short test_mm_cvtsi128_si16(__m128i A) { + // CHECK-LABEL: test_mm_cvtsi128_si16 + // CHECK: extractelement <8 x i16> %{{.*}}, i32 0 + return _mm_cvtsi128_si16(A); +} + +__m128i test_mm_cvtsi16_si128(short A) { + // CHECK-LABEL: test_mm_cvtsi16_si128 + // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3 + return _mm_cvtsi16_si128(A); +} + +__m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + // CHECK-LABEL: @test_mm512_mask_blend_ph + // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_blend_ph(__U, __A, __W); +} + +__m512h test_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { + // CHECK-LABEL: @test_mm512_permutex2var_ph + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half> + return _mm512_permutex2var_ph(__A, __I, __B); +} + +__m512h test_mm512_permutexvar_epi16(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_permutexvar_epi16 + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half> + return _mm512_permutexvar_ph(__A, __B); +} diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -0,0 +1,204 @@ +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +_Float16 test_mm_cvtsh_h(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtsh_h + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + return _mm_cvtsh_h(__A); +} + +_Float16 test_mm256_cvtsh_h(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtsh_h + // CHECK: 
extractelement <16 x half> %{{.*}}, i32 0 + return _mm256_cvtsh_h(__A); +} + +__m128h test_mm_set_sh(_Float16 __h) { + // CHECK-LABEL: @test_mm_set_sh + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 1 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 2 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 3 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 4 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 5 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 6 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 7 + return _mm_set_sh(__h); +} + +__m128h test_mm_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm_set1_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_set1_ph(h); +} + +__m256h test_mm256_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm256_set1_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_set1_ph(h); +} + +__m128h test_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + // CHECK-LABEL: @test_mm_set_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8); +} + +__m256h test_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + // CHECK-LABEL: @test_mm256_set_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 
7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); +} + +__m128h test_mm_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + // CHECK-LABEL: @test_mm_setr_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8); +} + +__m256h test_mm256_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + // CHECK-LABEL: @test_mm256_setr_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); +} + +__m128h test_mm_abs_ph(__m128h a) { + // CHECK-LABEL: @test_mm_abs_ph + // CHECK: and <4 x i32> + return _mm_abs_ph(a); +} + +__m256h test_mm256_abs_ph(__m256h a) { + // CHECK-LABEL: @test_mm256_abs_ph + // CHECK: and <8 x i32> + return _mm256_abs_ph(a); +} + +__m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { + // CHECK-LABEL: @test_mm_mask_blend_ph + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_blend_ph(__U, __A, __W); +} + +__m256h test_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { + // CHECK-LABEL: @test_mm256_mask_blend_ph + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_blend_ph(__U, __A, __W); +} + +__m128h test_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { + // CHECK-LABEL: @test_mm_permutex2var_ph + // CHECK: %{{.*}} = bitcast <8 x half> 
%{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x half> + return _mm_permutex2var_ph(__A, __I, __B); +} + +__m256h test_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { + // CHECK-LABEL: @test_mm256_permutex2var_ph + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> + return _mm256_permutex2var_ph(__A, __I, __B); +} + +__m128h test_mm_permutexvar_ph(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_permutexvar_ph + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x half> + return _mm_permutexvar_ph(__A, __B); +} + +__m256h test_mm256_permutexvar_ph(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_permutexvar_ph + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> + return _mm256_permutexvar_ph(__A, __B); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -54,9 +54,9 @@ // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" 
"target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -293,3 +293,8 @@ // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s // AVX-VNNI: "-target-feature" "+avxvnni" // NO-AVX-VNNI: "-target-feature" "-avxvnni" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512fp16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX512FP16 %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx512fp16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX512FP16 %s +// AVX512FP16: "-target-feature" "+avx512fp16" +// NO-AVX512FP16: "-target-feature" "-avx512fp16" diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1656,6 +1656,7 @@ // CHECK_SPR_M32: #define __AVX512BW__ 1 // CHECK_SPR_M32: #define __AVX512CD__ 1 // CHECK_SPR_M32: #define __AVX512DQ__ 1 +// CHECK_SPR_M32: #define __AVX512FP16__ 1 // CHECK_SPR_M32: #define __AVX512F__ 1 // CHECK_SPR_M32: #define __AVX512IFMA__ 1 // CHECK_SPR_M32: #define __AVX512VBMI2__ 1 @@ -1727,6 +1728,7 @@ // CHECK_SPR_M64: #define __AVX512BW__ 1 // CHECK_SPR_M64: #define __AVX512CD__ 1 // CHECK_SPR_M64: #define __AVX512DQ__ 1 +// CHECK_SPR_M64: #define __AVX512FP16__ 1 // CHECK_SPR_M64: #define __AVX512F__ 1 // CHECK_SPR_M64: #define __AVX512IFMA__ 1 // CHECK_SPR_M64: #define __AVX512VBMI2__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -558,3 +558,25 @@ // AVXVNNINOAVX2-NOT: #define __AVX2__ 1 // AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16 %s + +// AVX512FP16: #define __AVX512BW__ 1 +// AVX512FP16: #define __AVX512DQ__ 1 +// AVX512FP16: #define __AVX512FP16__ 1 +// AVX512FP16: #define __AVX512VL__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s + +// AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1 +// AVX512FP16NOAVX512VL-NOT: #define 
__AVX512VL__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s + +// AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1 +// AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s + +// AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1 +// AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -96,6 +96,8 @@ During this release ... +* Support for ``AVX512-FP16`` instructions has been added. + Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -312,6 +312,8 @@ def llvm_v2f16_ty : LLVMType; // 2 x half (__fp16) def llvm_v4f16_ty : LLVMType; // 4 x half (__fp16) def llvm_v8f16_ty : LLVMType; // 8 x half (__fp16) +def llvm_v16f16_ty : LLVMType; // 16 x half (__fp16) +def llvm_v32f16_ty : LLVMType; // 32 x half (__fp16) def llvm_v2bf16_ty : LLVMType; // 2 x bfloat (__bf16) def llvm_v4bf16_ty : LLVMType; // 4 x bfloat (__bf16) def llvm_v8bf16_ty : LLVMType; // 8 x bfloat (__bf16) diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -31,6 +31,8 @@ #define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes #define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes #define THREEDNOW_MAP_SYM x86Disassembler3DNowOpcodes +#define MAP5_SYM x86DisassemblerMap5Opcodes +#define MAP6_SYM x86DisassemblerMap6Opcodes #define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" #define CONTEXTS_STR "x86DisassemblerContexts" @@ -42,6 +44,8 @@ #define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" #define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" #define THREEDNOW_MAP_STR "x86Disassembler3DNowOpcodes" +#define MAP5_STR "x86DisassemblerMap5Opcodes" +#define MAP6_STR "x86DisassemblerMap6Opcodes" // Attributes of an instruction that must be known before the opcode can be // processed correctly. Most of these indicate the presence of particular @@ -292,7 +296,9 @@ XOP8_MAP = 4, XOP9_MAP = 5, XOPA_MAP = 6, - THREEDNOW_MAP = 7 + THREEDNOW_MAP = 7, + MAP5 = 8, + MAP6 = 9 }; // The following structs are used for the hierarchical decode table. After diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -193,6 +193,7 @@ X86_FEATURE (XSAVEOPT, "xsaveopt") X86_FEATURE (XSAVES, "xsaves") X86_FEATURE (HRESET, "hreset") +X86_FEATURE (AVX512FP16, "avx512fp16") X86_FEATURE (AVXVNNI, "avxvnni") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1050,6 +1050,10 @@ let IsLoad = true; let ScalarMemoryVT = i32; } +def extloadvf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = true; + let ScalarMemoryVT = f16; +} def extloadvf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { let IsLoad = true; let ScalarMemoryVT = f32; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -820,6 +820,7 @@ case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; + case ISD::STRICT_FP_TO_FP16: case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; @@ -871,13 +872,17 @@ // We actually deal with the partially-softened FP_TO_FP16 node too, which // returns an i16 so doesn't meet the constraints necessary for FP_ROUND. assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_ROUND); bool IsStrict = N->isStrictFPOpcode(); SDValue Op = N->getOperand(IsStrict ? 1 : 0); EVT SVT = Op.getValueType(); EVT RVT = N->getValueType(0); - EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT; + EVT FloatRVT = (N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16) + ? MVT::f16 + : RVT; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1633,6 +1633,7 @@ // For more info, see X86 ISA docs. Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1); Features["amx-bf16"] = HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave; + Features["avx512fp16"] = HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save; Features["amx-tile"] = HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave; Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave; bool HasLeaf7Subleaf1 = diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -201,11 +201,11 @@ FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B | FeatureCLWB | FeatureMOVDIRI | FeatureSHSTK | FeatureKL | FeatureWIDEKL; constexpr FeatureBitset FeaturesSapphireRapids = - FeaturesICLServer | FeatureAMX_TILE | FeatureAMX_INT8 | FeatureAMX_BF16 | - FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE | - FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE | - FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR | - FeatureWAITPKG | FeatureAVXVNNI; + FeaturesICLServer | FeatureAMX_BF16 | FeatureAMX_INT8 | FeatureAMX_TILE | + FeatureAVX512BF16 | FeatureAVX512FP16 | FeatureAVX512VP2INTERSECT | + FeatureAVXVNNI | FeatureCLDEMOTE | FeatureENQCMD | FeatureMOVDIR64B | + FeatureMOVDIRI | FeaturePTWRITE | FeatureSERIALIZE | FeatureSHSTK | + FeatureTSXLDTRK | FeatureUINTR | FeatureWAITPKG; // Intel Atom processors. 
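For reference, the Host.cpp hunk above keys runtime detection of AVX512-FP16 off CPUID leaf 7, sub-leaf 0, EDX bit 23. A minimal user-level probe of that same bit could look like the sketch below; it is illustrative only (the helper name has_avx512fp16 is not part of the patch) and it omits the XSAVE/AVX-512 state check that Host.cpp applies via HasAVX512Save.

  #include <cpuid.h>

  /* Report whether CPUID advertises AVX512-FP16: leaf 7, sub-leaf 0,
     EDX bit 23, mirroring the Host.cpp change above. Production code must
     also confirm the OS has enabled AVX-512 state (XGETBV), which Host.cpp
     checks separately. */
  int has_avx512fp16(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
      return 0;
    return (edx >> 23) & 1;
  }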
// Bonnell has feature parity with Core2 and adds MOVBE. @@ -576,6 +576,8 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; +static constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = + FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL; // Key Locker Features constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2; constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2753,6 +2753,7 @@ .Case("1to4", "{1to4}") .Case("1to8", "{1to8}") .Case("1to16", "{1to16}") + .Case("1to32", "{1to32}") .Default(nullptr); if (!BroadcastPrimitive) return TokError("Invalid memory broadcast primitive."); diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -150,6 +150,12 @@ dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; + case MAP5: + dec = &MAP5_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case MAP6: + dec = &MAP6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; } switch (dec->modrm_type) { @@ -332,7 +338,7 @@ } if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && - ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + ((~byte1 & 0x8) == 0x8) && ((byte2 & 0x4) == 0x4)) { insn->vectorExtensionType = TYPE_EVEX; } else { --insn->readerCursor; // unconsume byte1 @@ -876,11 +882,11 @@ insn->opcodeType = ONEBYTE; if (insn->vectorExtensionType == TYPE_EVEX) { - switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { + switch (mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { default: LLVM_DEBUG( - dbgs() << format("Unhandled mm field for instruction (0x%hhx)", - mmFromEVEX2of4(insn->vectorExtensionPrefix[1]))); + dbgs() << format("Unhandled mmm field for instruction (0x%hhx)", + mmmFromEVEX2of4(insn->vectorExtensionPrefix[1]))); return true; case VEX_LOB_0F: insn->opcodeType = TWOBYTE; @@ -891,6 +897,12 @@ case VEX_LOB_0F3A: insn->opcodeType = THREEBYTE_3A; return consume(insn, insn->opcode); + case VEX_LOB_MAP5: + insn->opcodeType = MAP5; + return consume(insn, insn->opcode); + case VEX_LOB_MAP6: + insn->opcodeType = MAP6; + return consume(insn, insn->opcode); } } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { @@ -908,6 +920,12 @@ case VEX_LOB_0F3A: insn->opcodeType = THREEBYTE_3A; return consume(insn, insn->opcode); + case VEX_LOB_MAP5: + insn->opcodeType = MAP5; + return consume(insn, insn->opcode); + case VEX_LOB_MAP6: + insn->opcodeType = MAP6; + return consume(insn, insn->opcode); } } else if (insn->vectorExtensionType == TYPE_VEX_2B) { insn->opcodeType = TWOBYTE; @@ -1043,6 +1061,12 @@ case THREEDNOW_MAP: decision = &THREEDNOW_MAP_SYM; break; + case MAP5: + decision = &MAP5_SYM; + break; + case MAP6: + decision = &MAP6_SYM; + break; } if (decision->opcodeDecisions[insnCtx] diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -37,7 +37,7 @@ #define xFromEVEX2of4(evex) 
(((~(evex)) & 0x40) >> 6) #define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5) #define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4) -#define mmFromEVEX2of4(evex) ((evex) & 0x3) +#define mmmFromEVEX2of4(evex) ((evex) & 0x7) #define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7) #define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3) #define ppFromEVEX3of4(evex) ((evex) & 0x3) @@ -489,7 +489,9 @@ enum VEXLeadingOpcodeByte { VEX_LOB_0F = 0x1, VEX_LOB_0F38 = 0x2, - VEX_LOB_0F3A = 0x3 + VEX_LOB_0F3A = 0x3, + VEX_LOB_MAP5 = 0x5, + VEX_LOB_MAP6 = 0x6 }; enum XOPMapSelect { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -790,7 +790,7 @@ // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc. // OpMapShift = OpPrefixShift + 2, - OpMapMask = 0x7 << OpMapShift, + OpMapMask = 0xF << OpMapShift, // OB - OneByte - Set if this instruction has a one byte opcode. OB = 0 << OpMapShift, @@ -819,13 +819,17 @@ /// this flag to indicate that the encoder should do the wacky 3DNow! thing. ThreeDNow = 7 << OpMapShift, + // MAP5, MAP6 - Prefix after the 0x0F prefix. + T_MAP5 = 8 << OpMapShift, + T_MAP6 = 9 << OpMapShift, + //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. // They are used to specify GPRs and SSE registers, 64-bit operand size, // etc. We only cares about REX.W and REX.R bits and only the former is // statically determined. // - REXShift = OpMapShift + 3, + REXShift = OpMapShift + 4, REX_W = 1 << REXShift, //===------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -799,7 +799,10 @@ // 0b00001: implied 0F leading opcode // 0b00010: implied 0F 38 leading opcode bytes // 0b00011: implied 0F 3A leading opcode bytes - // 0b00100-0b11111: Reserved for future use + // 0b00100: Reserved for future use + // 0b00101: VEX MAP5 + // 0b00110: VEX MAP6 + // 0b00111-0b11111: Reserved for future use // 0b01000: XOP map select - 08h instructions with imm byte // 0b01001: XOP map select - 09h instructions with no imm byte // 0b01010: XOP map select - 0Ah instructions with imm dword @@ -825,6 +828,12 @@ case X86II::XOPA: VEX_5M = 0xA; break; + case X86II::T_MAP5: + VEX_5M = 0x5; + break; + case X86II::T_MAP6: + VEX_5M = 0x6; + break; } // VEX_4V (VEX vvvv field): a register specifier @@ -1173,10 +1182,10 @@ // EVEX opcode prefix can have 4 bytes // // +-----+ +--------------+ +-------------------+ +------------------------+ - // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // | 62h | | RXBR' | 0mmm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | // +-----+ +--------------+ +-------------------+ +------------------------+ - assert((VEX_5M & 0x3) == VEX_5M && - "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + assert((VEX_5M & 0x7) == VEX_5M && + "More than 3 significant bits in VEX.m-mmmm fields for EVEX!"); emitByte(0x62, OS); emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) | diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ 
-184,6 +184,14 @@ "HasVP2INTERSECT", "true", "Enable AVX-512 vp2intersect", [FeatureAVX512]>; +// FIXME: FP16 scalar intrinsics use the type v8f16, which is supposed to be +// guarded under condition hasVLX. So we imply it in FeatureFP16 currently. +// FIXME: FP16 conversion between f16 and i64 customize type v8i64, which is +// supposed to be guarded under condition hasDQI. So we imply it in FeatureFP16 +// currently. +def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true", + "Support 16-bit floating point", + [FeatureBWI, FeatureVLX, FeatureDQI]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -791,6 +799,7 @@ FeatureCLDEMOTE, FeatureWAITPKG, FeaturePTWRITE, + FeatureFP16, FeatureAVXVNNI, FeatureTSXLDTRK, FeatureENQCMD, diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -233,19 +233,19 @@ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have @@ -267,6 +267,7 @@ CCIfInReg>>>, CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, + CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>, CCDelegateTo ]>; @@ -329,6 +330,7 @@ // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. + CCIfType<[f16], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, @@ -552,7 +554,7 @@ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -561,33 +563,33 @@ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. 
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>, // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -635,13 +637,13 @@ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -655,7 +657,7 @@ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, // The first 4 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64], + CCIfType<[f16, f32, f64], CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], [RCX , RDX , R8 , R9 ]>>, @@ -678,7 +680,7 @@ // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. - CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>> + CCIfType<[i8, i16, i32, i64, f16, f32, f64], CCAssignToStack<8, 8>> ]>; def CC_X86_Win64_VectorCall : CallingConv<[ @@ -757,14 +759,15 @@ /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -772,16 +775,16 @@ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -791,16 +794,16 @@ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. 
- CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -819,11 +822,15 @@ CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + CCIfNotVarArg>>>, + // The first 3 __m64 vector arguments are passed in mmx registers if the // call is not a vararg call. CCIfNotVarArg>>, + CCIfType<[f16], CCAssignToStack<4, 4>>, + // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -55,6 +55,7 @@ /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf64; bool X86ScalarSSEf32; + bool X86ScalarSSEf16; public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, @@ -63,6 +64,7 @@ Subtarget = &funcInfo.MF->getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); + X86ScalarSSEf16 = Subtarget->hasFP16(); } bool fastSelectInstruction(const Instruction *I) override; @@ -157,7 +159,8 @@ /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -2283,9 +2286,10 @@ unsigned Opc; switch (RetVT.SimpleTy) { default: return false; - case MVT::i8: Opc = X86::CMOV_GR8; break; - case MVT::i16: Opc = X86::CMOV_GR16; break; - case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::f16: Opc = X86::CMOV_FR16X; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1121,7 +1121,10 @@ if (VT.isVector() || VT == MVT::f128) break; - MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; + MVT VecVT = VT == MVT::f64 ? MVT::v2f64 + : VT == MVT::f32 ? MVT::v4f32 + : MVT::v8f16; + SDLoc dl(N); SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(0)); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -460,6 +460,7 @@ MOVHLPS, MOVSD, MOVSS, + MOVSH, UNPCKL, UNPCKH, VPERMILPV, @@ -998,7 +999,8 @@ bool isCtlzFast() const override; bool hasBitPreservingFPLogic(EVT VT) const override { - return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); + return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || + (VT == MVT::f16 && X86ScalarSSEf16); } bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { @@ -1282,7 +1284,8 @@ /// register, not on the X87 floating point stack. 
bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 } /// Returns true if it is beneficial to convert a load of a constant @@ -1442,6 +1445,7 @@ /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf32; bool X86ScalarSSEf64; + bool X86ScalarSSEf16; /// A list of legal FP immediates. std::vector LegalFPImmediates; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -117,6 +117,7 @@ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); + X86ScalarSSEf16 = Subtarget.hasFP16(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. @@ -1903,6 +1904,71 @@ } } + if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { + auto setGroup = [&] (MVT VT) { + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + }; + + // AVX512_FP16 scalar operations + setGroup(MVT::f16); + addRegisterClass(MVT::f16, &X86::FR16XRegClass); + + if (Subtarget.useAVX512Regs()) { + setGroup(MVT::v32f16); + addRegisterClass(MVT::v32f16, &X86::VR512RegClass); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); + + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); + } + + if (Subtarget.hasVLX()) { + addRegisterClass(MVT::v8f16, &X86::VR128XRegClass); + addRegisterClass(MVT::v16f16, &X86::VR256XRegClass); + setGroup(MVT::v8f16); + setGroup(MVT::v16f16); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); + + // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); + + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); + + // Need to custom widen these to prevent scalarization. 
+ setOperationAction(ISD::LOAD, MVT::v4f16, Custom); + setOperationAction(ISD::STORE, MVT::v4f16, Custom); + } + + // Support fp16 0 immediate + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); @@ -2165,6 +2231,11 @@ return RegisterVT; } + // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. + // So its default register type is f16. We override the type to v8f16 here. + if (VT == MVT::v3f16 && Subtarget.hasFP16()) + return MVT::v8f16; + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -2183,6 +2254,11 @@ return NumRegisters; } + // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. + // So its default register number is 3. We override the number to 1 here. + if (VT == MVT::v3f16 && Subtarget.hasFP16()) + return 1; + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -3670,6 +3746,8 @@ RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; + else if (RegVT == MVT::f16) + RC = &X86::FR16XRegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -4872,6 +4950,7 @@ case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: @@ -7148,6 +7227,7 @@ break; case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); @@ -8685,7 +8765,8 @@ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && - (LoadSizeInBits == 32 || LoadSizeInBits == 64) && + ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 || + LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); @@ -9009,6 +9090,7 @@ // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) @@ -9071,6 +9153,9 @@ return BCast; } + if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + // Unsupported broadcast. return SDValue(); } @@ -10471,13 +10556,15 @@ if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - (EltVT == MVT::i64 && Subtarget.is64Bit())) { + if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 || + EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) || + (EltVT == MVT::i16 && Subtarget.hasFP16())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
+ // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a + // zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } @@ -10607,7 +10694,7 @@ DAG, Subtarget)) return V; - if (EVTBits == 16 && NumElems == 8) + if (EltVT == MVT::i16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; @@ -10664,7 +10751,7 @@ return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. - if (Subtarget.hasSSE41()) { + if (Subtarget.hasSSE41() && EltVT != MVT::f16) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -13462,7 +13549,7 @@ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); - if (EltVT == MVT::i8 || EltVT == MVT::i16) { + if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) @@ -13494,11 +13581,17 @@ if (!VT.is128BitVector()) return SDValue(); - // Otherwise, use MOVSD or MOVSS. - assert((EltVT == MVT::f32 || EltVT == MVT::f64) && - "Only two types of floating point element types to handle!"); - return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, - ExtVT, V1, V2); + // Otherwise, use MOVSD, MOVSS or MOVSH. + unsigned MovOpc = 0; + if (EltVT == MVT::f16) + MovOpc = X86ISD::MOVSH; + else if (EltVT == MVT::f32) + MovOpc = X86ISD::MOVSS; + else if (EltVT == MVT::f64) + MovOpc = X86ISD::MOVSD; + else + llvm_unreachable("Unsupported floating point element type to handle!"); + return DAG.getNode(MovOpc, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. @@ -15300,6 +15393,33 @@ Mask, Subtarget, DAG); } +/// Lower 8-lane 16-bit floating point shuffles. +static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + } + if (NumV2Elements == 1 && Mask[0] >= 8) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return V; + + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(MVT::v8f16, + DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); +} + // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, // sub-512-bit shuffles are padded to 512-bits for the shuffle and then // the active subvector is extracted. 
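The new lowerV8F16Shuffle above handles v8f16 shuffles by bitcasting the operands to v8i16 and reusing the existing integer shuffle lowering; the 256-bit and 512-bit cases below do the same through v16i16 and v32i16. A rough illustration of the kind of source that reaches this path when built with -mavx512fp16 -mavx512vl (the half8 typedef and function name are not from the patch):

  typedef _Float16 half8 __attribute__((__vector_size__(16)));

  /* Reverse the eight half lanes; the generic vector shuffle is lowered
     through the equivalent v8i16 shuffle, as in lowerV8F16Shuffle. */
  half8 reverse_half8(half8 x) {
    return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0);
  }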
@@ -15705,6 +15825,8 @@ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + case MVT::v8f16: + return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -17569,6 +17691,13 @@ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } + if (VT == MVT::v16f16) { + V1 = DAG.getBitcast(MVT::v16i16, V1); + V2 = DAG.getBitcast(MVT::v16i16, V2); + return DAG.getBitcast(MVT::v16f16, + DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask)); + } + switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -18135,6 +18264,13 @@ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } + if (VT == MVT::v32f16) { + V1 = DAG.getBitcast(MVT::v32i16, V1); + V2 = DAG.getBitcast(MVT::v32i16, V2); + return DAG.getBitcast(MVT::v32f16, + DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); + } + // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that @@ -18840,14 +18976,18 @@ MVT VT = Op.getSimpleValueType(); - if (VT.getSizeInBits() == 16) { + if (VT == MVT::i16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && - !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) + !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) { + if (Subtarget.hasFP16()) + return Op; + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); + } SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); @@ -18886,12 +19026,13 @@ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } - if (VT.getSizeInBits() == 32) { + if (VT == MVT::f16 || VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; - // SHUFPS the element to the lowest double word, then movss. - int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; + // Shuffle the element to the lowest element, then movss or movsh. + SmallVector Mask(VecVT.getVectorNumElements(), -1); + Mask[0] = static_cast(IdxVal); Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); @@ -19041,10 +19182,10 @@ } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - // This will be just movd/movq/movss/movsd. + // This will be just movw/movd/movq/movsh/movss/movsd. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) { if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - EltVT == MVT::i64) { + EltVT == MVT::f16 || EltVT == MVT::i64) { N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); } @@ -19143,8 +19284,9 @@ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && "Expected an SSE type!"); - // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. - if (OpVT == MVT::v4i32) + // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in + // tblgen. 
+ if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16())) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); @@ -22026,9 +22168,8 @@ MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); - assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || - VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && + assert(VT.isFloatingPoint() && VT != MVT::f80 && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to @@ -22042,7 +22183,9 @@ bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) - LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + LogicVT = (VT == MVT::f64) ? MVT::v2f64 + : (VT == MVT::f32) ? MVT::v4f32 + : MVT::v8f16; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... @@ -22087,9 +22230,8 @@ // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); - assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || - VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && + assert(VT.isFloatingPoint() && VT != MVT::f80 && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); @@ -22102,7 +22244,9 @@ bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) - LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + LogicVT = (VT == MVT::f64) ? MVT::v2f64 + : (VT == MVT::f32) ? MVT::v4f32 + : MVT::v8f16; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); @@ -23037,7 +23181,7 @@ if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); #endif bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; @@ -23051,7 +23195,10 @@ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 && (!IsStrict || Subtarget.hasVLX() || Op0.getSimpleValueType().is512BitVector())) { - assert(VT.getVectorNumElements() <= 16); +#ifndef NDEBUG + unsigned Num = VT.getVectorNumElements(); + assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16)); +#endif Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; @@ -31001,8 +31148,9 @@ assert(!VT.isVector() && "Vectors should have been handled above!"); - if (Subtarget.hasDQI() && VT == MVT::i64 && - (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { + if ((Subtarget.hasDQI() && VT == MVT::i64 && + (SrcVT == MVT::f32 || SrcVT == MVT::f64)) || + (Subtarget.hasFP16() && SrcVT == MVT::f16)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; // If we use a 128-bit result we might need to use a target specific node. @@ -31656,6 +31804,7 @@ NODE_NAME_CASE(MOVSLDUP) NODE_NAME_CASE(MOVSD) NODE_NAME_CASE(MOVSS) + NODE_NAME_CASE(MOVSH) NODE_NAME_CASE(UNPCKL) NODE_NAME_CASE(UNPCKH) NODE_NAME_CASE(VBROADCAST) @@ -32576,6 +32725,7 @@ // conditional jump around it. 
static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { + case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: @@ -35197,17 +35347,15 @@ unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT_MOVL vXi32 zero-extending instruction. - if (MaskEltSize == 32 && Mask[0] == 0) { - if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { + // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction. + if (Mask[0] == 0 && + (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) { + if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) || + (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + SrcVT = DstVT = + !Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT; return true; } } @@ -35501,6 +35649,12 @@ SrcVT = DstVT = MVT::v4f32; return true; } + if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) && + Subtarget.hasFP16()) { + Shuffle = X86ISD::MOVSH; + SrcVT = DstVT = MVT::v8f16; + return true; + } } // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. @@ -36931,6 +37085,10 @@ if (!VT.isVector() || !VT.isSimple()) return SDValue(); // Bail if we hit a non-simple non-vector. + // FIXME: Just bail on f16 for now. + if (VT.getVectorElementType() == MVT::f16) + return SDValue(); + assert((RootSizeInBits % VT.getSizeInBits()) == 0 && "Can only combine shuffles upto size of the root op."); @@ -38057,6 +38215,7 @@ assert(Mask.size() == 4); break; case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::MOVSS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); @@ -38441,6 +38600,12 @@ if (VT.is512BitVector()) return SDValue(); + // Do not generate X86ISD::ADDSUB node for FP16's vector types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with FP16 ADDSUB instructions! + if (VT.getVectorElementType() == MVT::f16) + return SDValue(); + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } @@ -40419,6 +40584,7 @@ // Check if we have a bitcast from another integer type as well. if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || + (Subtarget.hasFP16() && VT == MVT::f16) || (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); @@ -40994,7 +41160,8 @@ /// Extracting a scalar FP value from vector element 0 is free, so extract each /// operand first, then perform the math as a scalar op. 
-static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { +static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); SDValue Vec = ExtElt->getOperand(0); SDValue Index = ExtElt->getOperand(1); @@ -41022,7 +41189,8 @@ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); } - if (VT != MVT::f32 && VT != MVT::f64) + if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 && + VT != MVT::f64) return SDValue(); // Vector FP selects don't fit the pattern of FP math ops (because the @@ -41336,7 +41504,7 @@ if (SDValue V = combineArithReduction(N, DAG, Subtarget)) return V; - if (SDValue V = scalarizeExtEltFP(N, DAG)) + if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget)) return V; // Attempt to extract a i1 element by using MOVMSK to extract the signbits @@ -44315,7 +44483,8 @@ SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); - if (VT == MVT::f32 || VT == MVT::f64) { + if (VT == MVT::f32 || VT == MVT::f64 || + (VT == MVT::f16 && Subtarget.hasFP16())) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); @@ -44562,9 +44731,9 @@ EVT N10Type = N10.getValueType(); // Ensure that both types are the same and are legal scalar fp types. - if (N00Type != N10Type || - !((Subtarget.hasSSE1() && N00Type == MVT::f32) || - (Subtarget.hasSSE2() && N00Type == MVT::f64))) + if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) || + (Subtarget.hasSSE2() && N00Type == MVT::f64) || + (Subtarget.hasFP16() && N00Type == MVT::f16))) return SDValue(); unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); @@ -51312,6 +51481,7 @@ case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: @@ -52064,7 +52234,8 @@ /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { - return RC.hasSuperClassEq(&X86::FR32XRegClass) || + return RC.hasSuperClassEq(&X86::FR16XRegClass) || + RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -44,8 +44,9 @@ // It is a little bit complex for scalar types, where NumElts = 1. // In this case we build v4f32 or v2f64 string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 16), 8, !if (!eq (EltVT.Size, 32), 4, - !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + !if (!eq (EltVT.Size, 64), 2, NumElts))), NumElts) # EltVT; // The vector VT. ValueType VT = !cast(VTName); @@ -65,8 +66,9 @@ X86MemOperand MemOp = !cast(TypeVariantName # Size # "mem"); X86MemOperand ScalarMemOp = !cast(EltVT # "mem"); // FP scalar memory operand for intrinsics - ssmem/sdmem. 
- Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast("ssmem"), - !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?)); + Operand IntScalarMemOp = !if (!eq (EltTypeName, "f16"), !cast("shmem"), + !if (!eq (EltTypeName, "f32"), !cast("ssmem"), + !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?))); // Load patterns PatFrag LdFrag = !cast("load" # VTName); @@ -76,11 +78,9 @@ PatFrag ScalarLdFrag = !cast("load" # EltVT); PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); - PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"), - !cast("sse_load_f32"), - !if (!eq (EltTypeName, "f64"), - !cast("sse_load_f64"), - ?)); + PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f16"), !cast("sse_load_f16"), + !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), + !if (!eq (EltTypeName, "f64"), !cast("sse_load_f64"), ?))); // The string to specify embedded broadcast in assembly. string BroadcastStr = "{1to" # NumElts # "}"; @@ -95,9 +95,12 @@ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, !if (!eq (EltTypeName, "f64"), SSEPackedDouble, - SSEPackedInt)); + !if (!eq (EltTypeName, "f16"), SSEPackedSingle, // FIXME? + SSEPackedInt))); - RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, + !if (!eq (EltTypeName, "f16"), FR16X, + FR64X)); dag ImmAllZerosV = (VT immAllZerosV); @@ -109,6 +112,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -117,6 +121,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -124,6 +129,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; @@ -131,6 +137,7 @@ // with the appropriate element type. This allows to use the same masking logic. def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; +def f16x_info : X86VectorVTInfo<1, f16, VR128X, "sh">; def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; @@ -149,6 +156,8 @@ v4i32x_info>; def avx512vl_i64_info : AVX512VLVectorVTInfo; +def avx512vl_f16_info : AVX512VLVectorVTInfo; def avx512vl_f32_info : AVX512VLVectorVTInfo; def avx512vl_f64_info : AVX512VLVectorVTInfo; } +let Predicates = [HasFP16] in { +def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; +} + // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. 
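[Editor's note: illustrative sketch, not part of the patch.] The X86VectorVTInfo changes above extend the scalar (NumElts == 1) case so an f16 element is modelled with a v8f16 register-sized vector, just as f32 and f64 use v4f32 and v2f64. A small C++ sketch of that mapping, assuming a 128-bit XMM container and deriving the lane count from the element width the way the VTName computation does:

#include <cassert>
#include <cstdio>

// For a scalar element kept in an XMM register, pick the 128-bit "fake
// vector" shape used for pattern matching: 128 / element-size lanes.
// f16 -> v8f16, f32 -> v4f32, f64 -> v2f64.
unsigned fakeVectorLanes(unsigned EltSizeInBits) {
  assert(EltSizeInBits == 16 || EltSizeInBits == 32 || EltSizeInBits == 64);
  return 128 / EltSizeInBits;
}

int main() {
  std::printf("f16 -> v%uf16\n", fakeVectorLanes(16)); // v8f16
  std::printf("f32 -> v%uf32\n", fakeVectorLanes(32)); // v4f32
  std::printf("f64 -> v%uf64\n", fakeVectorLanes(64)); // v2f64
  return 0;
}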
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -513,6 +528,12 @@ [(set VR128X:$dst, fp128imm0)]>; } +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; +} + //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -649,16 +670,22 @@ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>; // Codegen pattern with the alternative types insert VEC128 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>; // Codegen pattern with the alternative types insert VEC256 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>; multiclass vinsert_for_mask_cast; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>; // Codegen pattern with the alternative types extract VEC256 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>; // A 128-bit extract from bits [255:128] of a 512-bit vector should use a @@ -1015,6 +1048,12 @@ (iPTR 1)))>; } +let Predicates = [HasFP16, HasVLX] in +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF32x4Z256rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; + // Additional patterns for handling a bitcast between the vselect and the // 
extract_subvector. @@ -1439,6 +1478,31 @@ Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } +let Predicates = [HasFP16] in { + def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZrm addr:$src)>; + + def : Pat<(v32f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZrr VR128X:$src)>; + def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; +} +let Predicates = [HasVLX, HasFP16] in { + def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZ128rm addr:$src)>; + def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZ256rm addr:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZ128rr VR128X:$src)>; + def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZ256rr VR128X:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZ128rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; + def : Pat<(v16f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZ256rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; +} //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS @@ -1462,6 +1526,8 @@ (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v32f16 (X86SubVBroadcastld256 addr:$src)), + (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTI64X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)), @@ -1475,6 +1541,8 @@ (VBROADCASTF32X4rm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v32f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4rm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)), @@ -1532,6 +1600,8 @@ (VBROADCASTF32X4Z256rm addr:$src)>; def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4Z256rm addr:$src)>; def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), @@ -3766,6 +3836,110 @@ def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } +let Predicates = [HasFP16] in { + def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), + (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; + def : Pat<(v32f16 (alignedloadv32f16 addr:$src)), + (VMOVAPSZrm addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (loadv32f16 addr:$src)), + (VMOVUPSZrm addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (loadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 
(masked_load addr:$src, VK32WM:$mask, (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, undef)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32f16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; + def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), + (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; +} +let Predicates = [HasFP16, HasVLX] in { + def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; + def : Pat<(v16f16 (alignedloadv16f16 addr:$src)), + (VMOVAPSZ256rm addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (loadv16f16 addr:$src)), + (VMOVUPSZ256rm addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (loadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, undef)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16f16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), + (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; + + def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; + def : Pat<(v8f16 (alignedloadv8f16 addr:$src)), + (VMOVAPSZ128rm addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (loadv8f16 addr:$src)), + (VMOVUPSZ128rm addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (loadv8f16 
addr:$src)), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, undef)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8f16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), + (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; +} // Move Int Doubleword to Packed Double Int // @@ -3905,12 +4079,13 @@ (VMOV64toSDZrr (KMOVQrk VK64:$src))>; //===----------------------------------------------------------------------===// -// AVX-512 MOVSS, MOVSD +// AVX-512 MOVSH, MOVSS, MOVSD //===----------------------------------------------------------------------===// multiclass avx512_move_scalar { - let Predicates = [HasAVX512, OptForSize] in + X86VectorVTInfo _, + list prd = [HasAVX512, OptForSize]> { + let Predicates = prd in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3976,6 +4151,9 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, + [HasFP16, OptForSize]>, + VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering { @@ -4144,9 +4322,14 @@ addr:$srcAddr)>; } +defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; +defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4154,6 +4337,13 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4179,6 +4369,10 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; +defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 
1)))))), GR32>; defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4186,6 +4380,13 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4211,6 +4412,16 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk + (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), + VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), @@ -4259,6 +4470,32 @@ (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + let Predicates = [HasFP16] in { + def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, T_MAP5XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrr">, + Sched<[SchedWriteFShuffle.XMM]>; + + let Constraints = "$src0 = $dst" in + def VMOVSHZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f16x_info.RC:$src0, f16x_info.KRCWM:$mask, + VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrrk">, + Sched<[SchedWriteFShuffle.XMM]>; + + def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrrkz">, + Sched<[SchedWriteFShuffle.XMM]>; + } def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4311,6 +4548,16 @@ Sched<[SchedWriteFShuffle.XMM]>; } +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSHZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + (VMOVSHZrrk_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + (VMOVSHZrrkz_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"# @@ -4393,6 +4640,29 @@ def : Pat<(v8f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; } +let Predicates = [HasFP16] in 
{ + def : Pat<(v8f16 (X86vzmovl (v8f16 VR128X:$src))), + (VMOVSHZrr (v8f16 (AVX512_128_SET0)), VR128X:$src)>; + + // FIXME we need better canonicalization in dag combine + def : Pat<(v16f16 (X86vzmovl (v16f16 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)), + (v8f16 (EXTRACT_SUBREG (v16f16 VR256X:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v32f16 (X86vzmovl (v32f16 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)), + (v8f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v8f16 (X86vzload16 addr:$src)), + (VMOVSHZrm addr:$src)>; + + def : Pat<(v16f16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>; + + def : Pat<(v32f16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>; +} let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), @@ -12200,3 +12470,96 @@ defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, avx512vl_f32_info, avx512vl_i32_info, HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX512FP16 +//===----------------------------------------------------------------------===// + +let Predicates = [HasFP16] in { +// Move word ( r/m16) to Packed word +def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveFromGpr]>; +def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src), + "vmovw\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v8i16 (scalar_to_vector (loadi16 addr:$src))))]>, + T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>; + +def : Pat<(f16 (bitconvert GR16:$src)), + (f16 (COPY_TO_REGCLASS + (VMOVW2SHrr + (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), + FR16X))>; +def : Pat<(v8i16 (scalar_to_vector (i16 GR16:$src))), + (VMOVW2SHrr (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit))>; +def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (and GR32:$src, 0xffff)))), + (VMOVW2SHrr GR32:$src)>; +// FIXME: We should really find a way to improve these patterns. +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (and GR32:$src, 0xffff))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; +def : Pat<(v16i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (and GR32:$src, 0xffff))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; + +def : Pat<(v8i16 (X86vzmovl (v8i16 (scalar_to_vector (i16 (trunc GR32:$src)))))), + (VMOVW2SHrr GR32:$src)>; + +// AVX 128-bit movw instruction write zeros in the high 128-bit part. +def : Pat<(v8i16 (X86vzload16 addr:$src)), + (VMOVWrm addr:$src)>; +def : Pat<(v16i16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>; + +// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. 
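[Editor's note: illustrative sketch, not part of the patch.] The X86vzmovl and X86vzload16 patterns above, and the VMOVSH/VMOVW lowerings they select, all describe the same "move one scalar, zero the rest" shape: lane 0 of the destination comes from the source and every other half lane is cleared. A hedged reference model in plain C++, with uint16_t standing in for a half lane and the function name invented for illustration:

#include <cstdint>
#include <vector>

// Reference semantics for a zero-extending scalar move into a vector:
// lane 0 receives the scalar, all remaining lanes are zeroed.
std::vector<uint16_t> vzmovl16(uint16_t scalar, unsigned numLanes) {
  std::vector<uint16_t> result(numLanes, 0); // upper lanes are zero
  if (numLanes != 0)
    result[0] = scalar;                      // low lane gets the value
  return result;
}

int main() {
  // The v8f16 (128-bit), v16f16 (256-bit) and v32f16 (512-bit) cases follow
  // the same shape; only the number of zeroed lanes differs.
  auto v128 = vzmovl16(0x3C00 /* 1.0 in binary16 */, 8);
  auto v256 = vzmovl16(0x3C00, 16);
  auto v512 = vzmovl16(0x3C00, 32);
  return (v128[0] == 0x3C00 && v256[5] == 0 && v512[31] == 0) ? 0 : 1;
}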
+def : Pat<(v32i16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>; + +def : Pat<(v4i32 (scalar_to_vector (i32 (extloadi16 addr:$src)))), + (VMOVWrm addr:$src)>; +def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (i32 (zextloadi16 addr:$src))))), + (VMOVWrm addr:$src)>; +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (i32 (zextloadi16 addr:$src)))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>; +def : Pat<(v16i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (i32 (zextloadi16 addr:$src)))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>; + +// Move word from xmm register to r/m16 +def VMOVSH2Wrr : AVX512<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveToGpr]>; +def VMOVWmr : AVX512<0x7E, MRMDestMem, (outs), + (ins i16mem:$dst, VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", + [(store (i16 (extractelt (v8i16 VR128X:$src), + (iPTR 0))), addr:$dst)]>, + T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>; + +def : Pat<(i16 (bitconvert FR16X:$src)), + (i16 (EXTRACT_SUBREG + (VMOVSH2Wrr (COPY_TO_REGCLASS FR16X:$src, VR128X)), + sub_16bit))>; +def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), + (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>; +} + +// Allow "vmovw" to use GR64 +let hasSideEffects = 0 in { + def VMOVW64toSHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; + def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; +} diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -555,6 +555,7 @@ let Predicates = [HasMMX] in defm _VR64 : CMOVrr_PSEUDO; + defm _FR16X : CMOVrr_PSEUDO; let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; let Predicates = [HasSSE2,NoAVX512] in @@ -612,6 +613,8 @@ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8f16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), @@ -623,6 +626,8 @@ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16f16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), @@ -635,6 +640,8 @@ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32f16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : 
Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -149,8 +149,8 @@ // disable to ANDPS. // Class specifying the opcode map. -class Map val> { - bits<3> Value = val; +class Map val> { + bits<4> Value = val; } def OB : Map<0>; def TB : Map<1>; @@ -160,6 +160,8 @@ def XOP9 : Map<5>; def XOPA : Map<6>; def ThreeDNow : Map<7>; +def T_MAP5 : Map<8>; +def T_MAP6 : Map<9>; // Class specifying the encoding class Encoding val> { @@ -204,6 +206,16 @@ class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } class ThreeDNow { Map OpMap = ThreeDNow; } +class T_MAP5 { Map OpMap = T_MAP5; } +class T_MAP5PS : T_MAP5 { Prefix OpPrefix = PS; } // none +class T_MAP5PD : T_MAP5 { Prefix OpPrefix = PD; } // 0x66 +class T_MAP5XS : T_MAP5 { Prefix OpPrefix = XS; } // 0xF3 +class T_MAP5XD : T_MAP5 { Prefix OpPrefix = XD; } // 0xF2 +class T_MAP6 { Map OpMap = T_MAP6; } +class T_MAP6PS : T_MAP6 { Prefix OpPrefix = PS; } +class T_MAP6PD : T_MAP6 { Prefix OpPrefix = PD; } +class T_MAP6XS : T_MAP6 { Prefix OpPrefix = XS; } +class T_MAP6XD : T_MAP6 { Prefix OpPrefix = XD; } class OBXS { Prefix OpPrefix = XS; } class PS : TB { Prefix OpPrefix = PS; } class PD : TB { Prefix OpPrefix = PD; } @@ -301,7 +313,7 @@ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? bits<3> OpPrefixBits = OpPrefix.Value; Map OpMap = OB; // Which opcode map does this inst have? - bits<3> OpMapBits = OpMap.Value; + bits<4> OpMapBits = OpMap.Value; bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? @@ -360,28 +372,28 @@ let TSFlags{10-9} = AdSizeBits; // No need for 3rd bit, we don't need to distinguish NoPrfx from PS. let TSFlags{12-11} = OpPrefixBits{1-0}; - let TSFlags{15-13} = OpMapBits; - let TSFlags{16} = hasREX_WPrefix; - let TSFlags{20-17} = ImmT.Value; - let TSFlags{23-21} = FPForm.Value; - let TSFlags{24} = hasLockPrefix; - let TSFlags{25} = hasREPPrefix; - let TSFlags{27-26} = ExeDomain.Value; - let TSFlags{29-28} = OpEncBits; - let TSFlags{37-30} = Opcode; + let TSFlags{16-13} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0. - let TSFlags{38} = HasVEX_W; - let TSFlags{39} = hasVEX_4V; - let TSFlags{40} = hasVEX_L; - let TSFlags{41} = hasEVEX_K; - let TSFlags{42} = hasEVEX_Z; - let TSFlags{43} = hasEVEX_L2; - let TSFlags{44} = hasEVEX_B; + let TSFlags{39} = HasVEX_W; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_L; + let TSFlags{42} = hasEVEX_K; + let TSFlags{43} = hasEVEX_Z; + let TSFlags{44} = hasEVEX_L2; + let TSFlags{45} = hasEVEX_B; // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
- let TSFlags{51-45} = CD8_Scale; - let TSFlags{52} = hasEVEX_RC; - let TSFlags{53} = hasNoTrackPrefix; - let TSFlags{54} = ExplicitVEXPrefix; + let TSFlags{52-46} = CD8_Scale; + let TSFlags{53} = hasEVEX_RC; + let TSFlags{54} = hasNoTrackPrefix; + let TSFlags{55} = ExplicitVEXPrefix; } class PseudoI pattern> diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -417,6 +417,11 @@ SDTCisVT<1, v4f32>, SDTCisVT<2, v4f32>]>>; +def X86Movsh : SDNode<"X86ISD::MOVSH", + SDTypeProfile<1, 2, [SDTCisVT<0, v8f16>, + SDTCisVT<1, v8f16>, + SDTCisVT<2, v8f16>]>>; + def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4f32>, @@ -796,6 +801,7 @@ //===----------------------------------------------------------------------===// // 128-bit load pattern fragments +def loadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (load node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; @@ -804,6 +810,7 @@ def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>; // 256-bit load pattern fragments +def loadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (load node:$ptr))>; def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; @@ -812,6 +819,7 @@ def loadv32i8 : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>; // 512-bit load pattern fragments +def loadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (load node:$ptr))>; def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -823,6 +831,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv2f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv4f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv8f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv16f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; // Like 'store', but always requires vector size alignment. 
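[Editor's note: illustrative sketch, not part of the patch.] The X86InstrFormats.td hunk above widens the opcode-map field from 3 to 4 bits so the new T_MAP5/T_MAP6 maps (values 8 and 9) fit, which is why every TSFlags field above OpMapBits moves up by exactly one position: OpMap now occupies bits 16-13, hasREX_WPrefix moves from bit 16 to 17, and so on through ExplicitVEXPrefix at bit 55. A small sketch of that kind of packed-flag relayout, with invented field names used only for illustration:

#include <cstdint>
#include <cstdio>

// Old layout: 3-bit map field in bits [15:13], next flag at bit 16.
// New layout: 4-bit map field in bits [16:13], next flag at bit 17
// (everything above the widened field shifts up by one).
constexpr uint64_t packOld(uint64_t map3, uint64_t rexW) {
  return ((map3 & 0x7) << 13) | ((rexW & 0x1) << 16);
}
constexpr uint64_t packNew(uint64_t map4, uint64_t rexW) {
  return ((map4 & 0xF) << 13) | ((rexW & 0x1) << 17);
}

int main() {
  // Map value 8 (T_MAP5 in the patch) needs the fourth bit, so it only
  // survives round-tripping through the new layout.
  std::printf("old map field: %llu\n",
              (unsigned long long)((packOld(8, 1) >> 13) & 0x7)); // 0 (lost)
  std::printf("new map field: %llu\n",
              (unsigned long long)((packNew(8, 1) >> 13) & 0xF)); // 8
  return 0;
}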
def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -839,6 +851,8 @@ // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv8f16 : PatFrag<(ops node:$ptr), + (v8f16 (alignedload node:$ptr))>; def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), @@ -854,6 +868,8 @@ // 256-bit aligned load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv16f16 : PatFrag<(ops node:$ptr), + (v16f16 (alignedload node:$ptr))>; def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), @@ -868,6 +884,8 @@ (v32i8 (alignedload node:$ptr))>; // 512-bit aligned load pattern fragments +def alignedloadv32f16 : PatFrag<(ops node:$ptr), + (v32f16 (alignedload node:$ptr))>; def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (alignedload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), @@ -926,6 +944,11 @@ def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; +def X86vzload16 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + def X86vzload32 : PatFrag<(ops node:$src), (X86vzld node:$src), [{ return cast(N)->getMemoryVT().getStoreSize() == 4; @@ -976,6 +999,10 @@ // only load a single element. // FIXME: We should add more canolicalizing in DAGCombine. Particulary removing // the simple_load case. +def sse_load_f16 : PatFrags<(ops node:$ptr), + [(v8f16 (simple_load node:$ptr)), + (v8f16 (X86vzload16 node:$ptr)), + (v8f16 (scalar_to_vector (loadf16 node:$ptr)))]>; def sse_load_f32 : PatFrags<(ops node:$ptr), [(v4f32 (simple_load node:$ptr)), (v4f32 (X86vzload32 node:$ptr)), @@ -985,9 +1012,13 @@ (v2f64 (X86vzload64 node:$ptr)), (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>; +def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def fp16imm0 : PatLeaf<(f16 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -699,6 +699,8 @@ return true; case X86::MOV16rm: case X86::KMOVWkm: + case X86::VMOVSHZrm: + case X86::VMOVSHZrm_alt: MemBytes = 2; return true; case X86::MOV32rm: @@ -795,6 +797,7 @@ return true; case X86::MOV16mr: case X86::KMOVWmk: + case X86::VMOVSHZmr: MemBytes = 2; return true; case X86::MOV32mr: @@ -980,6 +983,7 @@ case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0F128: case X86::AVX_SET0: @@ -1047,6 +1051,8 @@ case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: + case X86::VMOVSHZrm: + case X86::VMOVSHZrm_alt: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: @@ -3605,6 +3611,10 @@ case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; + if (X86::FR16XRegClass.hasSubClassEq(RC)) { + assert(STI.hasFP16()); + return load ? 
X86::VMOVSHZrm_alt : X86::VMOVSHZmr; + } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -4755,6 +4765,7 @@ return true; } case X86::AVX512_128_SET0: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0F128: { @@ -6101,6 +6112,9 @@ case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::AVX512_FsFLD0SH: + Alignment = Align(2); + break; default: return nullptr; } @@ -6136,6 +6150,7 @@ case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: @@ -6174,6 +6189,8 @@ Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); + else if (Opc == X86::AVX512_FsFLD0SH) + Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 16); diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -421,6 +421,7 @@ def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; @@ -919,6 +920,7 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">; def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; +def HasFP16 : Predicate<"Subtarget->hasFP16()">; def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; @@ -1193,6 +1195,7 @@ }]>; def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf16 : PatFrag<(ops node:$ptr), (f16 (load node:$ptr))>; def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -25,6 +25,8 @@ let Predicates = [HasAVX512] in { // A vector extract of the first f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128X:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128X:$src), FR16X)>; def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>; def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), @@ -32,6 +34,8 @@ } let Predicates = [NoVLX] in { + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. 
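[Editor's note: illustrative sketch, not part of the patch.] The getLoadStoreRegOpcode change above teaches the 2-byte spill-slot case about FR16X: mask registers keep using KMOVW, the new half register class spills and reloads through the vmovsh forms (guarded by an FP16 assertion), and GR16 keeps MOV16rm/MOV16mr. A condensed, hedged C++ sketch of that dispatch; the enum and helper below are stand-ins, not the real LLVM types:

#include <cassert>
#include <string>

// Stand-in register classes for the 2-byte spill case handled in the patch.
enum class RegClass2Byte { VK16, FR16X, GR16 };

// Pick the opcode used to reload (load) or spill (store) a 2-byte value,
// mirroring the case-2 logic in getLoadStoreRegOpcode.
std::string spillOpcode2Byte(RegClass2Byte RC, bool IsLoad, bool HasFP16) {
  switch (RC) {
  case RegClass2Byte::VK16:
    return IsLoad ? "KMOVWkm" : "KMOVWmk";
  case RegClass2Byte::FR16X:
    assert(HasFP16 && "half spills require AVX512-FP16");
    return IsLoad ? "VMOVSHZrm_alt" : "VMOVSHZmr";
  case RegClass2Byte::GR16:
    return IsLoad ? "MOV16rm" : "MOV16mr";
  }
  return "";
}

int main() {
  return spillOpcode2Byte(RegClass2Byte::FR16X, /*IsLoad=*/true,
                          /*HasFP16=*/true) == "VMOVSHZrm_alt" ? 0 : 1;
}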
def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; @@ -41,6 +45,8 @@ } let Predicates = [HasVLX] in { + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128X)>; // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32X:$src)), (COPY_TO_REGCLASS FR32X:$src, VR128X)>; @@ -74,6 +80,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -85,6 +92,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -96,6 +104,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // If we're inserting into an all zeros vector, just use a plain move which @@ -159,6 +168,12 @@ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>; } +let Predicates = [HasFP16, HasVLX] in { + defm : subvec_zero_lowering<"APSZ128", VR128X, v16f16, v8f16, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v32f16, v8f16, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ256", VR256X, v32f16, v16f16, v16i32, sub_ymm>; +} + class maskzeroupper : PatLeaf<(vt RC:$src), [{ return isMaskZeroExtended(N); diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -567,9 +567,9 @@ // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -587,7 +587,7 @@ } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Represents the lower 16 registers that have VEX/legacy encodable subregs. 
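[Editor's note: illustrative sketch, not part of the patch.] The register-class updates above only add the half vector types (v8f16, v16f16, v32f16) to the existing XMM/YMM/ZMM classes; no new physical registers are introduced, so whole-vector loads, stores and register copies can keep reusing the f32-typed instructions, which is exactly what the earlier VMOVAPS/VMOVUPS patterns for f16 vectors rely on. A tiny C++ illustration that a 128-bit payload is simply reinterpreted, using float and uint16_t as the two views of the same bits:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // One 128-bit payload, viewed either as 4 x f32 or as 8 x 16-bit lanes.
  float asF32[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  uint16_t asF16Lanes[8];
  static_assert(sizeof(asF32) == sizeof(asF16Lanes), "same 128-bit payload");

  // A register-class copy or an aligned 128-bit load/store does not care
  // which element type the bits are later interpreted as.
  std::memcpy(asF16Lanes, asF32, sizeof(asF32));

  for (uint16_t lane : asF16Lanes)
    std::printf("%04x ", lane);
  std::printf("\n");
  return 0;
}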
@@ -599,10 +599,12 @@ def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; + // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -87,8 +87,10 @@ } // Multiclass that wraps X86SchedWriteWidths for each fp vector type. -class X86SchedWriteSizes { + X86SchedWriteWidths PH = sPH; X86SchedWriteWidths PS = sPS; X86SchedWriteWidths PD = sPD; } @@ -681,20 +683,22 @@ WriteVarBlendY, WriteVarBlendZ>; // Vector size wrappers. +// FIXME: Currently PH uses the same schedule method as PS. +// We may refine them later. def SchedWriteFAddSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFCmpSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFMulSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFDivSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFSqrtSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFLogicSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFShuffleSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; //===----------------------------------------------------------------------===// // Generic Processor Scheduler Models. 
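[Editor's note: illustrative sketch, not part of the patch.] The X86Schedule.td hunk above adds a PH member to X86SchedWriteSizes but, per the FIXME, passes the same write class for PH as for PS, so packed-half arithmetic is modelled with single-precision scheduling data until dedicated numbers exist. A hedged C++ sketch of that "reuse one column for another" setup; the struct and latency values below are invented placeholders, not real scheduling data:

#include <cstdio>

// Placeholder per-width scheduling info; PH/PS/PD mirror the TableGen fields.
struct SchedWriteWidths {
  unsigned Latency; // invented example field
};

struct SchedWriteSizes {
  SchedWriteWidths PH, PS, PD;
};

int main() {
  SchedWriteWidths AddPS{4}, AddPD{4}; // invented latencies
  // Until FP16-specific data is available, PH is initialized from the same
  // class as PS, matching how the patch passes one write class for both.
  SchedWriteSizes FAddSizes{/*PH=*/AddPS, /*PS=*/AddPS, /*PD=*/AddPD};
  std::printf("PH latency == PS latency: %d\n",
              FAddSizes.PH.Latency == FAddSizes.PS.Latency);
  return 0;
}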
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -353,6 +353,9 @@ /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX = false; + /// Processor has AVX-512 16 bit floating-point extenstions + bool HasFP16 = false; + /// Processor has PKU extenstions bool HasPKU = false; @@ -742,6 +745,7 @@ bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasFP16() const { return HasFP16; } bool hasPKU() const { return HasPKU; } bool hasVNNI() const { return HasVNNI; } bool hasBF16() const { return HasBF16; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1199,6 +1199,29 @@ LT.first = NumOfDests * NumOfShufflesPerDest; } + static const CostTblEntry AVX512FP16ShuffleTbl[] = { + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw + + {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb + + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb + + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w + }; + + if (!ST->useSoftFloat() && ST->hasFP16()) + if (const auto *Entry = + CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb @@ -4693,6 +4716,9 @@ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) return true; + if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16()) + return true; + if (!ScalarTy->isIntegerTy()) return false; @@ -5150,12 +5176,13 @@ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { - auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { + auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) { Type *EltTy = cast(VecTy)->getElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; - if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || + (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy())) return HasBW; return false; }; diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll @@ -0,0 +1,140 @@ +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mattr=avx512fp16 %s 2>&1 | FileCheck %s +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +@src = common local_unnamed_addr global [120 x half] zeroinitializer, align 4 +@dst = common local_unnamed_addr global [120 x half] zeroinitializer, align 4 + +; Function Attrs: 
norecurse nounwind
+define void @stride8(half %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 148 for VF 32 For instruction: %0 = load half
+
+ %cmp72 = icmp sgt i32 %width_, 0
+ br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %i.073
+ %0 = load half, half* %arrayidx, align 4
+ %mul = fmul fast half %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %i.073
+ %1 = load half, half* %arrayidx2, align 4
+ %add3 = fadd fast half %1, %mul
+ store half %add3, half* %arrayidx2, align 4
+ %add4 = or i32 %i.073, 1
+ %arrayidx5 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add4
+ %2 = load half, half* %arrayidx5, align 4
+ %mul6 = fmul fast half %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add4
+ %3 = load half, half* %arrayidx8, align 4
+ %add9 = fadd fast half %3, %mul6
+ store half %add9, half* %arrayidx8, align 4
+ %add10 = or i32 %i.073, 2
+ %arrayidx11 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add10
+ %4 = load half, half* %arrayidx11, align 4
+ %mul12 = fmul fast half %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add10
+ %5 = load half, half* %arrayidx14, align 4
+ %add15 = fadd fast half %5, %mul12
+ store half %add15, half* %arrayidx14, align 4
+ %add16 = or i32 %i.073, 3
+ %arrayidx17 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add16
+ %6 = load half, half* %arrayidx17, align 4
+ %mul18 = fmul fast half %6, %k
+ %arrayidx20 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add16
+ %7 = load half, half* %arrayidx20, align 4
+ %add21 = fadd fast half %7, %mul18
+ store half %add21, half* %arrayidx20, align 4
+ %add22 = or i32 %i.073, 4
+ %arrayidx23 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add22
+ %8 = load half, half* %arrayidx23, align 4
+ %mul24 = fmul fast half %8, %k
+ %arrayidx26 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add22
+ %9 = load half, half* %arrayidx26, align 4
+ %add27 = fadd fast half %9, %mul24
+ store half %add27, half* %arrayidx26, align 4
+ %add28 = or i32 %i.073, 5
+ %arrayidx29 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add28
+ %10 = load half, half* %arrayidx29, align 4
+ %mul30 = fmul fast half %10, %k
+ %arrayidx32 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add28
+ %11 = load half, half* %arrayidx32, align 4
+ %add33 = fadd fast half %11, %mul30
+ store half %add33, half* %arrayidx32, align 4
+ %add34 = or i32 %i.073, 6
+ %arrayidx35 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add34
+ %12 = load half, half* %arrayidx35, align 4
+ %mul36 = fmul fast half %12, %k
+ %arrayidx38 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add34
+ %13 = load half, half* %arrayidx38, align 4
+ %add39 = fadd fast half %13, %mul36
+ store half %add39, half* %arrayidx38, align 4
+ %add40 = or i32 %i.073, 7
+ %arrayidx41 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add40
+ %14 = load half, half* %arrayidx41, align 4
+ %mul42 = fmul fast half %14, %k
+ %arrayidx44 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add40
+ %15 = load half, half* %arrayidx44, align 4
+ %add45 = fadd fast half %15, %mul42
+ store half %add45, half* %arrayidx44, align 4
+ %add46 = add nuw nsw i32 %i.073, 8
+ %cmp = icmp slt i32 %add46, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Function Attrs: norecurse nounwind
+define void @stride3(half %k, i32 %width_) {
+entry:
+
+; CHECK: Found an estimated cost of 18 for VF 32 For instruction: %0 = load half
+
+ %cmp27 = icmp sgt i32 %width_, 0
+ br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ ret void
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
+ %arrayidx = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %i.028
+ %0 = load half, half* %arrayidx, align 4
+ %mul = fmul fast half %0, %k
+ %arrayidx2 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %i.028
+ %1 = load half, half* %arrayidx2, align 4
+ %add3 = fadd fast half %1, %mul
+ store half %add3, half* %arrayidx2, align 4
+ %add4 = add nuw nsw i32 %i.028, 1
+ %arrayidx5 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add4
+ %2 = load half, half* %arrayidx5, align 4
+ %mul6 = fmul fast half %2, %k
+ %arrayidx8 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add4
+ %3 = load half, half* %arrayidx8, align 4
+ %add9 = fadd fast half %3, %mul6
+ store half %add9, half* %arrayidx8, align 4
+ %add10 = add nuw nsw i32 %i.028, 2
+ %arrayidx11 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add10
+ %4 = load half, half* %arrayidx11, align 4
+ %mul12 = fmul fast half %4, %k
+ %arrayidx14 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add10
+ %5 = load half, half* %arrayidx14, align 4
+ %add15 = fadd fast half %5, %mul12
+ store half %add15, half* %arrayidx14, align 4
+ %add16 = add nuw nsw i32 %i.028, 3
+ %cmp = icmp slt i32 %add16, %width_
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+}
+
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s
+
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; CHECK-LABEL: 'test_vXf16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer
+ %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer
+ %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer
+ %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer
+ %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s
+
+define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) {
+; CHECK-LABEL: 'test_vXf16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32>
+ %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32>
+ %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32>
+ %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32>
+ %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32>
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s
+
+define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024) {
+; CHECK-LABEL: 'test_vXf16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32>
+ %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32>
+ %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32>
+ %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32>
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s
+
+define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024, <8 x half> %src128_1, <16 x half> %src256_1, <32 x half> %src512_1, <64 x half> %src1024_1) {
+; CHECK-LABEL: 'test_vXf16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> %src512_1, <32 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x half> %src1024, <64 x half> %src1024_1, <64 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32>
+ %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32>
+ %V512 = shufflevector <32 x half> %src512, <32 x half> %src512_1, <32 x i32>
+ %V1024 = shufflevector <64 x half> %src1024, <64 x half> %src1024_1, <64 x i32>
+ ret void
+}
diff --git a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
--- a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
+++ b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir
@@ -28,8 +28,8 @@
 liveins: $rdi, $rsi

 ; CHECK-LABEL: name: test
- ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi,
- INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
+ ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi,
+ INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags
 $rax = MOV64rr killed $rsi
 RETQ killed $rax
 ...
@@ -45,8 +45,8 @@
 ; Verify that the register ties are preserved.
; CHECK-LABEL: name: test2 - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags - INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RETQ killed $rax ... diff --git a/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll b/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK + +define <8 x half> @extract_v16f16_v8f16_0(<16 x half> %x) { +; CHECK-LABEL: extract_v16f16_v8f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v16f16_v8f16_1(<16 x half> %x) { +; CHECK-LABEL: extract_v16f16_v8f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_0(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_1(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_2(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_3(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <16 x half> @extract_v32f16_v81616_0(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v81616_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, 
<32 x half> undef, <16 x i32> + ret <16 x half> %a +} + +define <16 x half> @extract_v32f16_v81616_1(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v81616_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <16 x i32> + ret <16 x half> %a +} + +define <16 x half> @concat_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: concat_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %x, <8 x half> %y, <16 x i32> + ret <16 x half> %a +} + +define <32 x half> @concat_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: concat_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> %y, <32 x i32> + ret <32 x half> %a +} + +define <16 x half> @concat_zero_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: concat_zero_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %x, <8 x half> zeroinitializer, <16 x i32> + ret <16 x half> %a +} + +define <32 x half> @concat_zero_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: concat_zero_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> zeroinitializer, <32 x i32> + ret <32 x half> %a +} + +define <32 x half> @insert_v8f16_v32f16_0(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_1(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_2(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_3(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -0,0 +1,1887 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86 + +define <8 x half> @broadcastph128(half* 
%x) { +; X64-LABEL: broadcastph128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %xmm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <8 x half> undef, half %l1, i32 0 + %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256(half* %x) { +; X64-LABEL: broadcastph256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %ymm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <16 x half> undef, half %l1, i32 0 + %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512(half* %x) { +; X64-LABEL: broadcastph512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %zmm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <32 x half> undef, half %l1, i32 0 + %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define <8 x half> @broadcastph128_scalar(half %x) { +; X64-LABEL: broadcastph128_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph128_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %vec = insertelement <8 x half> undef, half %x, i32 0 + %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256_scalar(half %x) { +; X64-LABEL: broadcastph256_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph256_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: retl + %vec = insertelement <16 x half> undef, half %x, i32 0 + %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512_scalar(half %x) { +; X64-LABEL: broadcastph512_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph512_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: retl + %vec = insertelement <32 x half> undef, half %x, i32 0 + %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define <8 x half> @broadcastph128_reg(<8 x half> %x) { +; CHECK-LABEL: broadcastph128_reg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256_reg(<16 x half> %x) { +; CHECK-LABEL: broadcastph256_reg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512_reg(<32 x half> %x) { +; CHECK-LABEL: broadcastph512_reg: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define i16 @test1(half %x) { +; X64-LABEL: test1: +; X64: # %bb.0: +; X64-NEXT: vmovw %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test1: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %res = bitcast half %x to i16 + ret i16 %res +} + +define <8 x i16> @test2(i16 %x) { +; X64-LABEL: test2: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test2: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16>undef, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <8 x i16> @test4(i16* %x) { +; X64-LABEL: test4: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x + %res = insertelement <8 x i16>undef, i16 %y, i32 0 + ret <8 x i16>%res +} + +define void @test5(half %x, half* %y) { +; X64-LABEL: test5: +; X64: # %bb.0: +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: test5: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + store half %x, half* %y, align 2 + ret void +} + +define half @test7(i16* %x) { +; X64-LABEL: test7: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x + %res = bitcast i16 %y to half + ret half %res +} + +define <8 x i16> @test10(i16* %x) { +; X64-LABEL: test10: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test10b(i16* %x) { +; X64-LABEL: test10b: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10b: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test10c(i16* %x) { +; X64-LABEL: test10c: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10c: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0 + ret <32 x i16>%res +} + +define <8 x half> @test11(half* %x) { +; X64-LABEL: test11: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <8 x half>zeroinitializer, half %y, i32 0 + ret <8 x half>%res +} + +define <16 x half> @test11b(half* %x) { +; X64-LABEL: test11b: +; X64: # %bb.0: +; X64-NEXT: vmovsh 
(%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11b: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <16 x half>zeroinitializer, half %y, i32 0 + ret <16 x half>%res +} + +define <32 x half> @test11c(half* %x) { +; X64-LABEL: test11c: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11c: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <32 x half>zeroinitializer, half %y, i32 0 + ret <32 x half>%res +} + +define <8 x half> @test14(half %x) { +; X64-LABEL: test14: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x half>zeroinitializer, half %x, i32 0 + ret <8 x half>%res +} + +define <16 x half> @test14b(half %x) { +; X64-LABEL: test14b: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14b: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <16 x half>zeroinitializer, half %x, i32 0 + ret <16 x half>%res +} + +define <32 x half> @test14c(half %x) { +; X64-LABEL: test14c: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14c: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <32 x half>zeroinitializer, half %x, i32 0 + ret <32 x half>%res +} + +define <8 x i16> @test15(i16 %x) { +; X64-LABEL: test15: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test15: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test16(i16 %x) { +; X64-LABEL: test16: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test16: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test17(i16 %x) { +; X64-LABEL: test17: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test17: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0 + ret <32 x i16>%res +} + +define <8 x i16> @test18(i16 %x) { +; X64-LABEL: test18: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test18: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16> undef, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test19(i16 %x) { +; X64-LABEL: test19: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test19: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: retl + %res = insertelement <16 x i16> undef, i16 %x, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test20(i16 %x) { +; X64-LABEL: test20: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test20: +; X86: 
# %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: retl + %res = insertelement <32 x i16> undef, i16 %x, i32 0 + ret <32 x i16>%res +} + +@g8f16 = external global <8 x half> +@g8f16u = external global <8 x half>, align 8 +@g16f16 = external global <16 x half> +@g16f16u = external global <16 x half>, align 8 +@g32f16 = external global <32 x half> +@g32f16u = external global <32 x half>, align 8 + +define <32 x half> @load32f16(<32 x half>* %a) { +; X64-LABEL: load32f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: load32f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %zmm0 +; X86-NEXT: retl + %res = load <32 x half>, <32 x half>* %a + ret <32 x half> %res +} + +define <32 x half> @load32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { +; X64-LABEL: load32f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load32f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @load32f16maskz(<32 x half>* %a, i32 %c) { +; X64-LABEL: load32f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load32f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @loadu32f16(<32 x half>* %a) { +; X64-LABEL: loadu32f16: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 +; X86-NEXT: retl + %res = load <32 x half>, <32 x half>* %a, align 8 + ret <32 x half> %res +} + +define <32 x half> @loadu32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { +; X64-LABEL: loadu32f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a, align 8 + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @loadu32f16maskz(<32 x half>* %a, i32 %c) { +; X64-LABEL: loadu32f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a, align 8 + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define void @store32f16(<32 x half> %a) { +; 
X64-LABEL: store32f16: +; X64: # %bb.0: +; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %zmm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: store32f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %zmm0, g32f16 +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <32 x half> %a, <32 x half>* @g32f16 + ret void +} + +define void @storeu32f16(<32 x half> %a) { +; X64-LABEL: storeu32f16: +; X64: # %bb.0: +; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %zmm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu32f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %zmm0, g32f16u +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <32 x half> %a, <32 x half>* @g32f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>) +declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>) + +define void @storeu32f16mask(<32 x i1> %mask, <32 x half>* %addr, <32 x half> %val) { +; X64-LABEL: storeu32f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1} +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu32f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1} +; X86-NEXT: vzeroupper +; X86-NEXT: retl + call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %addr, i32 4, <32 x i1>%mask) + ret void +} + +define <32 x half> @maskloadu32f16(<32 x half>* %addr, <32 x half> %val, <32 x i1> %mask) { +; X64-LABEL: maskloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm1, %ymm1 +; X64-NEXT: vpmovb2m %ymm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm1, %ymm1 +; X86-NEXT: vpmovb2m %ymm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> %val) + ret <32 x half> %res +} + +define <32 x half> @maskuloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { +; X64-LABEL: maskuloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> undef) + ret <32 x half> %res +} + +define <32 x half> @maskzloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { +; X64-LABEL: maskzloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer) + ret <32 x half> %res +} + +define <32 x half> 
@movrr32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: movrr32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <32 x half> %b +} + +define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) { +; X64-LABEL: movrrk32f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk32f16: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i32 %msk to <32 x i1> + %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) { +; X64-LABEL: movrrkz32f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz32f16: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i32 %msk to <32 x i1> + %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <16 x half> @load16f16(<16 x half>* %a) { +; X64-LABEL: load16f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: load16f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %ymm0 +; X86-NEXT: retl + %res = load <16 x half>, <16 x half>* %a + ret <16 x half> %res +} + +define <16 x half> @load16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { +; X64-LABEL: load16f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load16f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @load16f16maskz(<16 x half>* %a, i16 %c) { +; X64-LABEL: load16f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load16f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @loadu16f16(<16 x half>* %a) { +; X64-LABEL: loadu16f16: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 +; X86-NEXT: retl + %res = load <16 x half>, <16 x half>* %a, align 8 + ret <16 x half> %res +} + +define <16 x half> @loadu16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { +; X64-LABEL: loadu16f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 
x half>* %a, align 8 + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @loadu16f16maskz(<16 x half>* %a, i16 %c) { +; X64-LABEL: loadu16f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a, align 8 + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define void @store16f16(<16 x half> %a) { +; X64-LABEL: store16f16: +; X64: # %bb.0: +; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %ymm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: store16f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %ymm0, g16f16 +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <16 x half> %a, <16 x half>* @g16f16 + ret void +} + +define void @storeu16f16(<16 x half> %a) { +; X64-LABEL: storeu16f16: +; X64: # %bb.0: +; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %ymm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu16f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %ymm0, g16f16u +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <16 x half> %a, <16 x half>* @g16f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>) +declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>) + +define void @storeu16f16mask(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { +; X64-LABEL: storeu16f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu16f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1} +; X86-NEXT: vzeroupper +; X86-NEXT: retl + call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) + ret void +} + +define <16 x half> @maskloadu16f16(<16 x half>* %addr, <16 x half> %val, <16 x i1> %mask) { +; X64-LABEL: maskloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm1, %xmm1 +; X64-NEXT: vpmovb2m %xmm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm1, %xmm1 +; X86-NEXT: vpmovb2m %xmm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> %val) + ret <16 x half> %res +} + +define <16 x half> @maskuloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { +; X64-LABEL: maskuloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <16 x half> 
@llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> undef) + ret <16 x half> %res +} + +define <16 x half> @maskzloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { +; X64-LABEL: maskzloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer) + ret <16 x half> %res +} + +define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: movrr16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <16 x half> %b +} + +define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) { +; X64-LABEL: movrrk16f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk16f16: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i16 %msk to <16 x i1> + %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) { +; X64-LABEL: movrrkz16f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz16f16: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i16 %msk to <16 x i1> + %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <8 x half> @load8f16(<8 x half>* %a) { +; X64-LABEL: load8f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: load8f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %xmm0 +; X86-NEXT: retl + %res = load <8 x half>, <8 x half>* %a + ret <8 x half> %res +} + +define <8 x half> @load8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) { +; X64-LABEL: load8f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load8f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @load8f16maskz(<8 x half>* %a, i8 %c) { +; X64-LABEL: load8f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load8f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @loadu8f16(<8 x half>* %a) { +; X64-LABEL: loadu8f16: +; X64: # %bb.0: 
+; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %xmm0 +; X86-NEXT: retl + %res = load <8 x half>, <8 x half>* %a, align 8 + ret <8 x half> %res +} + +define <8 x half> @loadu8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) { +; X64-LABEL: loadu8f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a, align 8 + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @loadu8f16maskz(<8 x half>* %a, i8 %c) { +; X64-LABEL: loadu8f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a, align 8 + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define void @store8f16(<8 x half> %a) { +; X64-LABEL: store8f16: +; X64: # %bb.0: +; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %xmm0, (%rax) +; X64-NEXT: retq +; +; X86-LABEL: store8f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %xmm0, g8f16 +; X86-NEXT: retl + store <8 x half> %a, <8 x half>* @g8f16 + ret void +} + +define void @storeu8f16(<8 x half> %a) { +; X64-LABEL: storeu8f16: +; X64: # %bb.0: +; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %xmm0, (%rax) +; X64-NEXT: retq +; +; X86-LABEL: storeu8f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %xmm0, g8f16u +; X86-NEXT: retl + store <8 x half> %a, <8 x half>* @g8f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) + +define void @storeu8f16mask(<8 x i1> %mask, <8 x half>* %addr, <8 x half> %val) { +; X64-LABEL: storeu8f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1} +; X64-NEXT: retq +; +; X86-LABEL: storeu8f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1} +; X86-NEXT: retl + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %addr, i32 4, <8 x i1>%mask) + ret void +} + +define <8 x half> @maskloadu8f16(<8 x half>* %addr, <8 x half> %val, <8 x i1> %mask) { +; X64-LABEL: maskloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm1, %xmm1 +; X64-NEXT: vpmovw2m %xmm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm1, %xmm1 +; X86-NEXT: vpmovw2m %xmm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> %val) + ret <8 x half> %res +} + +define <8 x 
half> @maskuloadu8f16(<8 x half>* %addr, <8 x i1> %mask) { +; X64-LABEL: maskuloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> undef) + ret <8 x half> %res +} + +define <8 x half> @maskzloadu8f16(<8 x half>* %addr, <8 x i1> %mask) { +; X64-LABEL: maskzloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer) + ret <8 x half> %res +} + +define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: movrr8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <8 x half> %b +} + +define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) { +; X64-LABEL: movrrk8f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk8f16: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i8 %msk to <8 x i1> + %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) { +; X64-LABEL: movrrkz8f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz8f16: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i8 %msk to <8 x i1> + %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define i16 @test_movw(half %x) { +; X64-LABEL: test_movw: +; X64: # %bb.0: +; X64-NEXT: vmovw %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_movw: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %res = bitcast half %x to i16 + ret i16 %res +} + +define half @test_movw2(i16 %x) { +; X64-LABEL: test_movw2: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_movw2: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = bitcast i16 %x to half + ret half %res +} + +; sext avoids having a truncate in front of the bitcast input due to calling +; convention or i16 op promotion. 
+define half @test_movw3(i8 %x) { +; X64-LABEL: test_movw3: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: vmovw %eax, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_movw3: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw %eax, %xmm0 +; X86-NEXT: retl + %z = sext i8 %x to i16 + %a = bitcast i16 %z to half + ret half %a +} + +define half @extract_f16_0(<8 x half> %x) { +; CHECK-LABEL: extract_f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 0 + ret half %res +} + +define half @extract_f16_1(<8 x half> %x) { +; CHECK-LABEL: extract_f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 1 + ret half %res +} + +define half @extract_f16_2(<8 x half> %x) { +; CHECK-LABEL: extract_f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 2 + ret half %res +} + +define half @extract_f16_3(<8 x half> %x) { +; CHECK-LABEL: extract_f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 3 + ret half %res +} + +define half @extract_f16_4(<8 x half> %x) { +; CHECK-LABEL: extract_f16_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 4 + ret half %res +} + +define half @extract_f16_5(<8 x half> %x) { +; CHECK-LABEL: extract_f16_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 5 + ret half %res +} + +define half @extract_f16_6(<8 x half> %x) { +; CHECK-LABEL: extract_f16_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 6 + ret half %res +} + +define half @extract_f16_7(<8 x half> %x) { +; CHECK-LABEL: extract_f16_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 7 + ret half %res +} + +define i16 @extract_i16_0(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 0 + ret i16 %res +} + +define i16 @extract_i16_1(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 1 + ret i16 %res +} + +define i16 @extract_i16_2(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 2 + ret i16 %res +} + +define i16 @extract_i16_3(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 3 + ret i16 %res +} + +define i16 @extract_i16_4(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_4: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 4 + ret i16 %res +} + +define i16 @extract_i16_5(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 5 + ret i16 %res +} + +define i16 @extract_i16_6(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 6 + ret i16 %res +} + +define i16 @extract_i16_7(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 7 + ret i16 %res +} + +define void @extract_store_f16_0(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_0: +; X64: # %bb.0: +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_0: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 0 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_1(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_1: +; X64: # %bb.0: +; X64-NEXT: vpsrld $16, %xmm0, %xmm0 +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrld $16, %xmm0, %xmm0 +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 1 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_2(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_2: +; X64: # %bb.0: +; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 2 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_3(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_3: +; X64: # %bb.0: +; X64-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 3 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_4(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_4: +; X64: # %bb.0: +; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 4 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_5(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_5: +; X64: # %bb.0: +; X64-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_5: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 5 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_6(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_6: +; X64: # %bb.0: +; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_6: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 6 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_7(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_7: +; X64: # %bb.0: +; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 7 + store half %res, half* %y + ret void +} + +define void @extract_store_i16_0(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_0: +; X64: # %bb.0: +; X64-NEXT: vpextrw $0, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_0: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $0, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 0 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_1(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_1: +; X64: # %bb.0: +; X64-NEXT: vpextrw $1, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $1, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 1 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_2(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_2: +; X64: # %bb.0: +; X64-NEXT: vpextrw $2, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $2, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 2 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_3(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_3: +; X64: # %bb.0: +; X64-NEXT: vpextrw $3, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $3, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 3 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_4(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_4: +; X64: # %bb.0: +; X64-NEXT: vpextrw $4, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $4, %xmm0, (%eax) 
+; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 4 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_5(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_5: +; X64: # %bb.0: +; X64-NEXT: vpextrw $5, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_5: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $5, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 5 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_6(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_6: +; X64: # %bb.0: +; X64-NEXT: vpextrw $6, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_6: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $6, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 6 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_7(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_7: +; X64: # %bb.0: +; X64-NEXT: vpextrw $7, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $7, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 7 + store i16 %res, i16* %y + ret void +} + +define i32 @extract_zext_i16_0(<8 x i16> %x) { +; CHECK-LABEL: extract_zext_i16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 0 + %res2 = zext i16 %res to i32 + ret i32 %res2 +} + +define i32 @extract_zext_i16_1(<8 x i16> %x) { +; CHECK-LABEL: extract_zext_i16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 1 + %res2 = zext i16 %res to i32 + ret i32 %res2 +} + +define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) { +; X64-LABEL: build_vector_xxxxuuuu: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxuuuu: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 0 + %b = insertelement <8 x half> %a, half %a1, i32 1 + %c = insertelement <8 x half> %b, half %a2, i32 2 + %d = insertelement <8 x half> %c, half %a3, i32 3 + ret <8 x half> %d +} + +define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) { +; X64-LABEL: build_vector_uuuuxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: 
build_vector_uuuuxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 4 + %b = insertelement <8 x half> %a, half %a1, i32 5 + %c = insertelement <8 x half> %b, half %a2, i32 6 + %d = insertelement <8 x half> %c, half %a3, i32 7 + ret <8 x half> %d +} + +define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { +; X64-LABEL: build_vector_xxxxxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 0 + %b = insertelement <8 x half> %a, half %a1, i32 1 + %c = insertelement <8 x half> %b, half %a2, i32 2 + %d = insertelement <8 x half> %c, half %a3, i32 3 + %e = insertelement <8 x half> %d, half %a4, i32 4 + %f = insertelement <8 x half> %e, half %a5, i32 5 + %g = insertelement <8 x half> %f, half %a6, i32 6 + %h = insertelement <8 x half> %g, half %a7, i32 7 + ret <8 x half> %h +} + +define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { +; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],zero,zero +; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: vpbroadcastq %xmm1, %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: retl + %a = insertelement <16 x half> undef, half %a0, i32 0 + %b = insertelement <16 x half> %a, half %a1, i32 1 + %c = insertelement <16 x half> %b, half %a2, i32 2 + %d = insertelement <16 x half> %c, half %a3, i32 3 + %e = insertelement <16 x half> %d, half %a4, i32 12 + %f = insertelement <16 x half> %e, half %a5, i32 13 + %g = insertelement <16 x half> %f, half %a6, i32 14 + %h = insertelement <16 x half> %g, half %a7, i32 15 + ret <16 x half> %h +} + +define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: regression1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + ret <8 x half> %res +} + +define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, i8* %4) { +; X64-LABEL: regression2: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rsi), %xmm0 +; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: regression2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-NEXT: retl + %6 = getelementptr i8, i8* %4, i64 0 + %7 = getelementptr i8, i8* %6, i64 0 + %8 = getelementptr i8, i8* %7, i64 0 + %9 = load i8, i8* %8, align 1 + %10 = getelementptr i8, i8* %8, i64 1 + %11 = addrspacecast i8* %10 to i8 addrspace(4)* + %12 = load i8, i8 addrspace(4)* %11, align 1 + %13 = insertelement <2 x i8> poison, i8 %9, i32 0 + %14 = insertelement <2 x i8> 
%13, i8 %12, i32 1 + %15 = uitofp <2 x i8> %14 to <2 x float> + %16 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32> + %17 = shufflevector <4 x float> %16, <4 x float> , <4 x i32> + %18 = fmul contract <4 x float> %17, + ret <4 x float> %18 +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s + +define void @test_mscatter_v16f16(half* %base, <16 x i32> %index, <16 x half> %val) +; CHECK-LABEL: test_mscatter_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastq %rdi, %zmm3 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2 +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm4 +; CHECK-NEXT: vpaddq %zmm4, %zmm2, %zmm2 +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm3 +; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3 +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm4 +; CHECK-NEXT: vmovq %xmm4, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 +; CHECK-NEXT: vpextrq $1, %xmm4, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm4 +; CHECK-NEXT: vmovq %xmm4, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrq $1, %xmm4, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vmovq %xmm0, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrq $1, %xmm0, %rax +; CHECK-NEXT: vmovsh %xmm3, (%rax) +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0 +; CHECK-NEXT: vmovq %xmm2, %rax +; CHECK-NEXT: vmovsh %xmm0, (%rax) +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1 +; CHECK-NEXT: vpextrq $1, %xmm2, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vmovq %xmm3, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm1 +; CHECK-NEXT: vpextrq $1, %xmm3, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3 +; CHECK-NEXT: vmovq %xmm3, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrq $1, %xmm3, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2 +; CHECK-NEXT: vmovq %xmm2, %rax +; CHECK-NEXT: vmovsh %xmm1, (%rax) +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: vpextrq $1, %xmm2, %rax +; CHECK-NEXT: vmovsh %xmm0, (%rax)
+; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +{ + %gep = getelementptr half, half* %base, <16 x i32> %index + call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> %val, <16 x half*> %gep, i32 4, <16 x i1> ) + ret void +} +declare void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> , <16 x half*> , i32 , <16 x i1>) diff --git a/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll b/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512fp16 | FileCheck %s + +define dso_local void @test_v8f16_v32f16(<8 x half>* %x_addr, <32 x half>* %y_addr) { +; CHECK-LABEL: test_v8f16_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <8 x half>, <8 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <8 x half> %0, <8 x half> %0, <32 x i32> + store <32 x half> %shuffle.i58, <32 x half>* %y_addr, align 64 + ret void +} + +define dso_local void @test_v8f16_v16f16(<8 x half>* %x_addr, <16 x half>* %y_addr) { +; CHECK-LABEL: test_v8f16_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <8 x half>, <8 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <8 x half> %0, <8 x half> %0, <16 x i32> + store <16 x half> %shuffle.i58, <16 x half>* %y_addr, align 64 + ret void +} + +define dso_local void @test_v16f16_v32f16(<16 x half>* %x_addr, <32 x half>* %y_addr) { +; CHECK-LABEL: test_v16f16_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <16 x half>, <16 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <16 x half> %0, <16 x half> %0, <32 x i32> + store <32 x half> %shuffle.i58, <32 x half>* %y_addr, align 64 + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s + +define signext i16 @test_mm_cvtsi128_si16(<2 x i64> %A) local_unnamed_addr #0 { +; CHECK-LABEL: test_mm_cvtsi128_si16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %A to <8 x i16> + %vecext.i = extractelement <8 x i16> %0, i32 0 + ret i16 %vecext.i +} + +define <2 x i64> @test_mm_cvtsi16_si128(i16 signext %A) local_unnamed_addr #0 { +; CHECK-LABEL: test_mm_cvtsi16_si128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovw %edi, %xmm0 +; CHECK-NEXT: retq +entry: + %vecinit7.i = insertelement <8 x i16> , i16 %A, i32 0 + %0 = bitcast <8 x i16> %vecinit7.i to <2 x i64> + ret <2 x i64> %0 +} diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- 
a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -3,17 +3,69 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE ; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 ; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=-sse | FileCheck %s --check-prefixes=X86 ; Check soft floating point conversion function calls. +@vf16 = common dso_local global half 0.000000e+00, align 2 @vf32 = common dso_local global float 0.000000e+00, align 4 @vf64 = common dso_local global double 0.000000e+00, align 8 @vf80 = common dso_local global x86_fp80 0xK00000000000000000000, align 8 @vf128 = common dso_local global fp128 0xL00000000000000000000000000000000, align 16 +define dso_local void @TestFPExtF16_F128() nounwind strictfp { +; X64-SSE-LABEL: TestFPExtF16_F128: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movzwl vf16(%rip), %edi +; X64-SSE-NEXT: callq __gnu_h2f_ieee@PLT +; X64-SSE-NEXT: callq __extendsftf2@PLT +; X64-SSE-NEXT: movaps %xmm0, vf128(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF16_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovsh vf16(%rip), %xmm0 +; X64-AVX512-NEXT: callq __extendhftf2@PLT +; X64-AVX512-NEXT: vmovaps %xmm0, vf128(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq +; +; X86-LABEL: TestFPExtF16_F128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $24, %esp +; X86-NEXT: movzwl vf16, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: wait +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendsftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 +; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +entry: + %0 = load half, half* @vf16, align 2 + %conv = call fp128 @llvm.experimental.constrained.fpext.f128.f16(half %0, metadata !"fpexcept.strict") #0 + store fp128 %conv, fp128* @vf128, align 16 + ret void +} + define dso_local void @TestFPExtF32_F128() nounwind strictfp { ; X64-SSE-LABEL: TestFPExtF32_F128: ; X64-SSE: # %bb.0: # %entry @@ -162,6 +214,44 @@ ret void } +define dso_local void @TestFPTruncF128_F16() nounwind strictfp { +; X64-SSE-LABEL: TestFPTruncF128_F16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movaps vf128(%rip), %xmm0 +; X64-SSE-NEXT: callq __trunctfhf2@PLT +; X64-SSE-NEXT: movw %ax, vf16(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F16: +; X64-AVX512: # %bb.0: # 
%entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovaps vf128(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfhf2@PLT +; X64-AVX512-NEXT: vmovsh %xmm0, vf16(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq +; +; X86-LABEL: TestFPTruncF128_F16: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl vf128+12 +; X86-NEXT: pushl vf128+8 +; X86-NEXT: pushl vf128+4 +; X86-NEXT: pushl vf128 +; X86-NEXT: calll __trunctfhf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: movw %ax, vf16 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = call half @llvm.experimental.constrained.fptrunc.f16.f128(fp128 %0, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + store half %conv, half* @vf16, align 2 + ret void +} + define dso_local void @TestFPTruncF128_F32() nounwind strictfp { ; X64-SSE-LABEL: TestFPTruncF128_F32: ; X64-SSE: # %bb.0: # %entry @@ -957,9 +1047,11 @@ attributes #0 = { strictfp } +declare half @llvm.experimental.constrained.fptrunc.f16.f128(fp128, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f128(fp128, metadata, metadata) declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.fptrunc.f80.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f16(half, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f80(x86_fp80, metadata) diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+avx512fp16 -mattr=+avx512vl -o - | FileCheck %s + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +define dso_local <32 x half> @foo3(<32 x half> %a, <32 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo3: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <32 x half> %a, <32 x half> %b + ret <32 x half> %spec.select +} + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +define dso_local <16 x half> @foo4(<16 x half> %a, <16 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo4: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <16 x half> %a, <16 x half> %b + ret <16 x half> %spec.select +} + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. 
+define dso_local <8 x half> @foo5(<8 x half> %a, <8 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo5: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <8 x half> %a, <8 x half> %b + ret <8 x half> %spec.select +} diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir @@ -340,7 +340,7 @@ ; CHECK: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags ; CHECK: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 ; CHECK: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags - ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %102, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %102, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags ; CHECK: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK: $rdi = COPY [[COPY4]] @@ -456,7 +456,7 @@ %63:gr64 = NOT64r %63 CMP64rr %63, %31, implicit-def $eflags %63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags - INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %53, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %53, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $rdi = COPY %64 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -0,0 +1,78 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vmovsh %xmm28, %xmm29, %xmm30 +# INTEL: vmovsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x10,0xf4 + +# ATT: vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} +# INTEL: vmovsh xmm30 {k7}, word ptr [rbp + 8*r14 + 268435456] 
+0x62,0x25,0x7e,0x0f,0x10,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovsh (%r9), %xmm30 +# INTEL: vmovsh xmm30, word ptr [r9] +0x62,0x45,0x7e,0x08,0x10,0x31 + +# ATT: vmovsh 254(%rcx), %xmm30 +# INTEL: vmovsh xmm30, word ptr [rcx + 254] +0x62,0x65,0x7e,0x08,0x10,0x71,0x7f + +# ATT: vmovsh -256(%rdx), %xmm30 {%k7} {z} +# INTEL: vmovsh xmm30 {k7} {z}, word ptr [rdx - 256] +0x62,0x65,0x7e,0x8f,0x10,0x72,0x80 + +# ATT: vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} +# INTEL: vmovsh word ptr [rbp + 8*r14 + 268435456] {k7}, xmm30 +0x62,0x25,0x7e,0x0f,0x11,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovsh %xmm30, (%r9) +# INTEL: vmovsh word ptr [r9], xmm30 +0x62,0x45,0x7e,0x08,0x11,0x31 + +# ATT: vmovsh %xmm30, 254(%rcx) +# INTEL: vmovsh word ptr [rcx + 254], xmm30 +0x62,0x65,0x7e,0x08,0x11,0x71,0x7f + +# ATT: vmovsh %xmm30, -256(%rdx) {%k7} +# INTEL: vmovsh word ptr [rdx - 256] {k7}, xmm30 +0x62,0x65,0x7e,0x0f,0x11,0x72,0x80 + +# ATT: vmovw %r12d, %xmm30 +# INTEL: vmovw xmm30, r12d +0x62,0x45,0x7d,0x08,0x6e,0xf4 + +# ATT: vmovw %xmm30, %r12d +# INTEL: vmovw r12d, xmm30 +0x62,0x45,0x7d,0x08,0x7e,0xf4 + +# ATT: vmovw 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vmovw xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovw (%r9), %xmm30 +# INTEL: vmovw xmm30, word ptr [r9] +0x62,0x45,0x7d,0x08,0x6e,0x31 + +# ATT: vmovw 254(%rcx), %xmm30 +# INTEL: vmovw xmm30, word ptr [rcx + 254] +0x62,0x65,0x7d,0x08,0x6e,0x71,0x7f + +# ATT: vmovw -256(%rdx), %xmm30 +# INTEL: vmovw xmm30, word ptr [rdx - 256] +0x62,0x65,0x7d,0x08,0x6e,0x72,0x80 + +# ATT: vmovw %xmm30, 268435456(%rbp,%r14,8) +# INTEL: vmovw word ptr [rbp + 8*r14 + 268435456], xmm30 +0x62,0x25,0x7d,0x08,0x7e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovw %xmm30, (%r9) +# INTEL: vmovw word ptr [r9], xmm30 +0x62,0x45,0x7d,0x08,0x7e,0x31 + +# ATT: vmovw %xmm30, 254(%rcx) +# INTEL: vmovw word ptr [rcx + 254], xmm30 +0x62,0x65,0x7d,0x08,0x7e,0x71,0x7f + +# ATT: vmovw %xmm30, -256(%rdx) +# INTEL: vmovw word ptr [rdx - 256], xmm30 +0x62,0x65,0x7d,0x08,0x7e,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16.s @@ -0,0 +1,77 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: vmovsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x10,0xf4] + vmovsh %xmm28, %xmm29, %xmm30 + +// CHECK: vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x10,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} + +// CHECK: vmovsh (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7e,0x08,0x10,0x31] + vmovsh (%r9), %xmm30 + +// CHECK: vmovsh 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x10,0x71,0x7f] + vmovsh 254(%rcx), %xmm30 + +// CHECK: vmovsh -256(%rdx), %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7e,0x8f,0x10,0x72,0x80] + vmovsh -256(%rdx), %xmm30 {%k7} {z} + +// CHECK: vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x11,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} + +// CHECK: vmovsh %xmm30, (%r9) +// CHECK: encoding: [0x62,0x45,0x7e,0x08,0x11,0x31] + vmovsh %xmm30, (%r9) + +// CHECK: vmovsh %xmm30, 254(%rcx) +// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x11,0x71,0x7f] + vmovsh %xmm30, 254(%rcx) + +// CHECK: vmovsh %xmm30, -256(%rdx) {%k7} +// CHECK: encoding: [0x62,0x65,0x7e,0x0f,0x11,0x72,0x80] + vmovsh %xmm30, 
-256(%rdx) {%k7} + +// CHECK: vmovw %r12d, %xmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x6e,0xf4] + vmovw %r12d, %xmm30 + +// CHECK: vmovw %xmm30, %r12d +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x7e,0xf4] + vmovw %xmm30, %r12d + +// CHECK: vmovw 268435456(%rbp,%r14,8), %xmm30 +// CHECK: encoding: [0x62,0x25,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw 268435456(%rbp,%r14,8), %xmm30 + +// CHECK: vmovw (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x6e,0x31] + vmovw (%r9), %xmm30 + +// CHECK: vmovw 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x6e,0x71,0x7f] + vmovw 254(%rcx), %xmm30 + +// CHECK: vmovw -256(%rdx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x6e,0x72,0x80] + vmovw -256(%rdx), %xmm30 + +// CHECK: vmovw %xmm30, 268435456(%rbp,%r14,8) +// CHECK: encoding: [0x62,0x25,0x7d,0x08,0x7e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw %xmm30, 268435456(%rbp,%r14,8) + +// CHECK: vmovw %xmm30, (%r9) +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x7e,0x31] + vmovw %xmm30, (%r9) + +// CHECK: vmovw %xmm30, 254(%rcx) +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x71,0x7f] + vmovw %xmm30, 254(%rcx) + +// CHECK: vmovw %xmm30, -256(%rdx) +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x72,0x80] + vmovw %xmm30, -256(%rdx) diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -0,0 +1,77 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vmovsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x10,0xf4] + vmovsh xmm6, xmm5, xmm4 + +// CHECK: vmovsh xmm6 {k7}, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x10,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovsh xmm6 {k7}, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmovsh xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x10,0x31] + vmovsh xmm6, word ptr [ecx] + +// CHECK: vmovsh xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x10,0x71,0x7f] + vmovsh xmm6, word ptr [ecx + 254] + +// CHECK: vmovsh xmm6 {k7} {z}, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x10,0x72,0x80] + vmovsh xmm6 {k7} {z}, word ptr [edx - 256] + +// CHECK: vmovsh word ptr [esp + 8*esi + 268435456] {k7}, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x11,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovsh word ptr [esp + 8*esi + 268435456] {k7}, xmm6 + +// CHECK: vmovsh word ptr [ecx], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x11,0x31] + vmovsh word ptr [ecx], xmm6 + +// CHECK: vmovsh word ptr [ecx + 254], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x11,0x71,0x7f] + vmovsh word ptr [ecx + 254], xmm6 + +// CHECK: vmovsh word ptr [edx - 256] {k7}, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x11,0x72,0x80] + vmovsh word ptr [edx - 256] {k7}, xmm6 + +// CHECK: vmovw xmm6, edx +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0xf2] + vmovw xmm6, edx + +// CHECK: vmovw edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0xf2] + vmovw edx, xmm6 + +// CHECK: vmovw xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovw xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmovw xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x31] + vmovw xmm6, word ptr [ecx] + +// CHECK: vmovw xmm6, word ptr [ecx + 254] +// CHECK: encoding: 
[0x62,0xf5,0x7d,0x08,0x6e,0x71,0x7f] + vmovw xmm6, word ptr [ecx + 254] + +// CHECK: vmovw xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x72,0x80] + vmovw xmm6, word ptr [edx - 256] + +// CHECK: vmovw word ptr [esp + 8*esi + 268435456], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovw word ptr [esp + 8*esi + 268435456], xmm6 + +// CHECK: vmovw word ptr [ecx], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x31] + vmovw word ptr [ecx], xmm6 + +// CHECK: vmovw word ptr [ecx + 254], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x71,0x7f] + vmovw word ptr [ecx + 254], xmm6 + +// CHECK: vmovw word ptr [edx - 256], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x72,0x80] + vmovw word ptr [edx - 256], xmm6 diff --git a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir --- a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir +++ b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir @@ -28,7 +28,7 @@ bb.0: liveins: $xmm0, $xmm1, $xmm2, $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm %0:_(s16) = COPY $xmm0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** @@ -40,7 +40,7 @@ ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** %3:_(<8 x s32>) = COPY $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm $xmm0 = COPY %0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h --- a/llvm/utils/TableGen/X86DisassemblerTables.h +++ b/llvm/utils/TableGen/X86DisassemblerTables.h @@ -41,7 +41,9 @@ /// [5] XOP9 map opcode /// [6] XOPA map opcode /// [7] 3dnow map opcode - std::unique_ptr Tables[8]; + /// [8] fixed length MAP5 opcode + /// [9] fixed length MAP6 opcode + std::unique_ptr Tables[10]; // Table of ModRM encodings. 
typedef std::map, unsigned> ModRMMapTy; diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -994,6 +994,8 @@ emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[5], XOP9_MAP_STR); emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[6], XOPA_MAP_STR); emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[7], THREEDNOW_MAP_STR); + emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[8], MAP5_STR); + emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[9], MAP6_STR); } void DisassemblerTables::emit(raw_ostream &o) const { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -130,7 +130,8 @@ }; enum { - OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7 + OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7, + T_MAP5 = 8, T_MAP6 = 9 }; enum { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -752,6 +752,8 @@ case X86Local::XOP9: opcodeType = XOP9_MAP; break; case X86Local::XOPA: opcodeType = XOPA_MAP; break; case X86Local::ThreeDNow: opcodeType = THREEDNOW_MAP; break; + case X86Local::T_MAP5: opcodeType = MAP5; break; + case X86Local::T_MAP6: opcodeType = MAP6; break; } std::unique_ptr filter; @@ -901,10 +903,13 @@ TYPE("FR64X", TYPE_XMM) TYPE("f64mem", TYPE_M) TYPE("sdmem", TYPE_M) + TYPE("FR16X", TYPE_XMM) TYPE("FR32", TYPE_XMM) TYPE("FR32X", TYPE_XMM) TYPE("f32mem", TYPE_M) + TYPE("f16mem", TYPE_M) TYPE("ssmem", TYPE_M) + TYPE("shmem", TYPE_M) TYPE("RST", TYPE_ST) TYPE("RSTi", TYPE_ST) TYPE("i128mem", TYPE_M) @@ -1019,6 +1024,7 @@ ENCODING("FR128", ENCODING_IB) ENCODING("VR128", ENCODING_IB) ENCODING("VR256", ENCODING_IB) + ENCODING("FR16X", ENCODING_IB) ENCODING("FR32X", ENCODING_IB) ENCODING("FR64X", ENCODING_IB) ENCODING("VR128X", ENCODING_IB) @@ -1047,6 +1053,7 @@ ENCODING("FR32", ENCODING_RM) ENCODING("FR64X", ENCODING_RM) ENCODING("FR32X", ENCODING_RM) + ENCODING("FR16X", ENCODING_RM) ENCODING("VR64", ENCODING_RM) ENCODING("VR256", ENCODING_RM) ENCODING("VR256X", ENCODING_RM) @@ -1091,6 +1098,7 @@ ENCODING("VR128X", ENCODING_REG) ENCODING("FR64X", ENCODING_REG) ENCODING("FR32X", ENCODING_REG) + ENCODING("FR16X", ENCODING_REG) ENCODING("VR512", ENCODING_REG) ENCODING("VK1", ENCODING_REG) ENCODING("VK2", ENCODING_REG) @@ -1127,6 +1135,7 @@ ENCODING("FR64", ENCODING_VVVV) ENCODING("VR128", ENCODING_VVVV) ENCODING("VR256", ENCODING_VVVV) + ENCODING("FR16X", ENCODING_VVVV) ENCODING("FR32X", ENCODING_VVVV) ENCODING("FR64X", ENCODING_VVVV) ENCODING("VR128X", ENCODING_VVVV) @@ -1170,6 +1179,7 @@ ENCODING("i32mem", ENCODING_RM) ENCODING("i64mem", ENCODING_RM) ENCODING("i8mem", ENCODING_RM) + ENCODING("shmem", ENCODING_RM) ENCODING("ssmem", ENCODING_RM) ENCODING("sdmem", ENCODING_RM) ENCODING("f128mem", ENCODING_RM) @@ -1177,6 +1187,7 @@ ENCODING("f512mem", ENCODING_RM) ENCODING("f64mem", ENCODING_RM) ENCODING("f32mem", ENCODING_RM) + ENCODING("f16mem", ENCODING_RM) ENCODING("i128mem", ENCODING_RM) ENCODING("i256mem", ENCODING_RM) ENCODING("i512mem", ENCODING_RM)