diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -3553,6 +3553,8 @@ .. option:: -mavx512f, -mno-avx512f +.. option:: -mavx512fp16, -mno-avx512fp16 + .. option:: -mavx512ifma, -mno-avx512ifma .. option:: -mavx512pf, -mno-avx512pf diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -596,6 +596,7 @@ * 64-bit ARM (AArch64) * AMDGPU * SPIR +* X86 (Only available under feature AVX512-FP16) ``_Float16`` will be supported on more targets as they define ABIs for it. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -133,7 +133,7 @@ X86 Support in Clang -------------------- -- ... +- Support for ``AVX512-FP16`` instructions has been added. Internal API Changes -------------------- diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1849,6 +1849,10 @@ TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "avx512vp2intersect,avx512vl") TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl") +// AVX512 fp16 intrinsics +TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") + // generic select intrinsics TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") @@ -1859,6 +1863,9 @@ TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_selectph_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectph_256, "V16xUsV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_selectph_512, "V32xUiV32xV32x", "ncV:512:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2OiUcV2OiV2Oi", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4OiUcV4OiV4Oi", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8OiUcV8OiV8Oi", "ncV:512:", "avx512f") @@ -1868,6 +1875,7 @@ TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "ncV:128:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "ncV:256:", "avx512vl") TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_selectsh_128, "V8xUcV8xV8x", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_selectss_128, "V4fUcV4fV4f", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_selectsd_128, "V2dUcV2dV2d", "ncV:128:", "avx512f") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4165,6 +4165,8 @@ def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group; def mavx512er : Flag<["-"], "mavx512er">, Group; def mno_avx512er : Flag<["-"], 
"mno-avx512er">, Group; +def mavx512fp16 : Flag<["-"], "mavx512fp16">, Group; +def mno_avx512fp16 : Flag<["-"], "mno-avx512fp16">, Group; def mavx512ifma : Flag<["-"], "mavx512ifma">, Group; def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group; def mavx512pf : Flag<["-"], "mavx512pf">, Group; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -92,6 +92,7 @@ bool HasAVX512CD = false; bool HasAVX512VPOPCNTDQ = false; bool HasAVX512VNNI = false; + bool HasAVX512FP16 = false; bool HasAVX512BF16 = false; bool HasAVX512ER = false; bool HasAVX512PF = false; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -231,6 +231,9 @@ HasAVX512BF16 = true; } else if (Feature == "+avx512er") { HasAVX512ER = true; + } else if (Feature == "+avx512fp16") { + HasAVX512FP16 = true; + HasFloat16 = true; } else if (Feature == "+avx512pf") { HasAVX512PF = true; } else if (Feature == "+avx512dq") { @@ -668,6 +671,8 @@ Builder.defineMacro("__AVX512BF16__"); if (HasAVX512ER) Builder.defineMacro("__AVX512ER__"); + if (HasAVX512FP16) + Builder.defineMacro("__AVX512FP16__"); if (HasAVX512PF) Builder.defineMacro("__AVX512PF__"); if (HasAVX512DQ) @@ -856,6 +861,7 @@ .Case("avx512vnni", true) .Case("avx512bf16", true) .Case("avx512er", true) + .Case("avx512fp16", true) .Case("avx512pf", true) .Case("avx512dq", true) .Case("avx512bitalg", true) @@ -948,6 +954,7 @@ .Case("avx512vnni", HasAVX512VNNI) .Case("avx512bf16", HasAVX512BF16) .Case("avx512er", HasAVX512ER) + .Case("avx512fp16", HasAVX512FP16) .Case("avx512pf", HasAVX512PF) .Case("avx512dq", HasAVX512DQ) .Case("avx512bitalg", HasAVX512BITALG) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -12671,6 +12671,7 @@ case X86::BI__builtin_ia32_storeups512_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_storesh128_mask: case X86::BI__builtin_ia32_storess128_mask: case X86::BI__builtin_ia32_storesd128_mask: return EmitX86MaskedStore(*this, Ops, Align(1)); @@ -12806,6 +12807,7 @@ case X86::BI__builtin_ia32_loaddqudi512_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); + case X86::BI__builtin_ia32_loadsh128_mask: case X86::BI__builtin_ia32_loadss128_mask: case X86::BI__builtin_ia32_loadsd128_mask: return EmitX86MaskedLoad(*this, Ops, Align(1)); @@ -13685,6 +13687,9 @@ case X86::BI__builtin_ia32_selectq_128: case X86::BI__builtin_ia32_selectq_256: case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: case X86::BI__builtin_ia32_selectps_128: case X86::BI__builtin_ia32_selectps_256: case X86::BI__builtin_ia32_selectps_512: @@ -13692,6 +13697,7 @@ case X86::BI__builtin_ia32_selectpd_256: case X86::BI__builtin_ia32_selectpd_512: return EmitX86Select(*this, Ops[0], Ops[1], Ops[2]); + case X86::BI__builtin_ia32_selectsh_128: case X86::BI__builtin_ia32_selectss_128: case X86::BI__builtin_ia32_selectsd_128: { Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -2812,7 +2812,8 @@ Hi = Integer; } else if (k >= BuiltinType::Bool && k <= 
BuiltinType::LongLong) {
       Current = Integer;
-    } else if (k == BuiltinType::Float || k == BuiltinType::Double) {
+    } else if (k == BuiltinType::Float || k == BuiltinType::Double ||
+               k == BuiltinType::Float16) {
       Current = SSE;
     } else if (k == BuiltinType::LongDouble) {
       const llvm::fltSemantics *LDF = &getTarget().getLongDoubleFormat();
@@ -2943,7 +2944,7 @@
         Current = Integer;
       else if (Size <= 128)
         Lo = Hi = Integer;
-    } else if (ET == getContext().FloatTy) {
+    } else if (ET->isFloat16Type() || ET == getContext().FloatTy) {
       Current = SSE;
     } else if (ET == getContext().DoubleTy) {
       Lo = Hi = SSE;
@@ -3396,27 +3397,76 @@
   return false;
 }
 
+/// ContainsHalfAtOffset - Return true if the specified LLVM IR type has a
+/// half member at the specified offset. For example, {int,{half}} has a
+/// half at offset 4. It is conservatively correct for this routine to return
+/// false.
+/// FIXME: Merge with ContainsFloatAtOffset
+static bool ContainsHalfAtOffset(llvm::Type *IRType, unsigned IROffset,
+                                 const llvm::DataLayout &TD) {
+  // Base case if we find a half.
+  if (IROffset == 0 && IRType->isHalfTy())
+    return true;
+
+  // If this is a struct, recurse into the field at the specified offset.
+  if (llvm::StructType *STy = dyn_cast<llvm::StructType>(IRType)) {
+    const llvm::StructLayout *SL = TD.getStructLayout(STy);
+    unsigned Elt = SL->getElementContainingOffset(IROffset);
+    IROffset -= SL->getElementOffset(Elt);
+    return ContainsHalfAtOffset(STy->getElementType(Elt), IROffset, TD);
+  }
+
+  // If this is an array, recurse into the field at the specified offset.
+  if (llvm::ArrayType *ATy = dyn_cast<llvm::ArrayType>(IRType)) {
+    llvm::Type *EltTy = ATy->getElementType();
+    unsigned EltSize = TD.getTypeAllocSize(EltTy);
+    IROffset -= IROffset / EltSize * EltSize;
+    return ContainsHalfAtOffset(EltTy, IROffset, TD);
+  }
+
+  return false;
+}
 
 /// GetSSETypeAtOffset - Return a type that will be passed by the backend in the
 /// low 8 bytes of an XMM register, corresponding to the SSE class.
 llvm::Type *X86_64ABIInfo::
 GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset,
                    QualType SourceTy, unsigned SourceOffset) const {
-  // The only three choices we have are either double, <2 x float>, or float. We
-  // pass as float if the last 4 bytes is just padding.  This happens for
-  // structs that contain 3 floats.
-  if (BitsContainNoUserData(SourceTy, SourceOffset*8+32,
-                            SourceOffset*8+64, getContext()))
-    return llvm::Type::getFloatTy(getVMContext());
+  // If the high 32 bits are not used, we have three choices: a single half,
+  // a single float, or two halves.
+  if (BitsContainNoUserData(SourceTy, SourceOffset * 8 + 32,
+                            SourceOffset * 8 + 64, getContext())) {
+    if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()))
+      return llvm::Type::getFloatTy(getVMContext());
+    if (ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout()))
+      return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()),
+                                        2);
+
+    return llvm::Type::getHalfTy(getVMContext());
+  }
 
   // We want to pass as <2 x float> if the LLVM IR type contains a float at
-  // offset+0 and offset+4.  Walk the LLVM IR type to find out if this is the
+  // offset+0 and offset+4. Walk the LLVM IR type to find out if this is the
   // case.
if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) && - ContainsFloatAtOffset(IRType, IROffset+4, getDataLayout())) + ContainsFloatAtOffset(IRType, IROffset + 4, getDataLayout())) return llvm::FixedVectorType::get(llvm::Type::getFloatTy(getVMContext()), 2); + // We want to pass as <4 x half> if the LLVM IR type contains a half at + // offset+0, +2, +4. Walk the LLVM IR type to find out if this is the case. + if (ContainsHalfAtOffset(IRType, IROffset, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 2, getDataLayout()) && + ContainsHalfAtOffset(IRType, IROffset + 4, getDataLayout())) + return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 4); + + // We want to pass as <4 x half> if the LLVM IR type contains a mix of float + // and half. + // FIXME: Do we have a better representation for the mixed type? + if (ContainsFloatAtOffset(IRType, IROffset, getDataLayout()) || + ContainsFloatAtOffset(IRType, IROffset + 4, getDataLayout())) + return llvm::FixedVectorType::get(llvm::Type::getHalfTy(getVMContext()), 4); + return llvm::Type::getDoubleTy(getVMContext()); } @@ -3521,11 +3571,11 @@ // struct. if (HiStart != 8) { // There are usually two sorts of types the ABI generation code can produce - // for the low part of a pair that aren't 8 bytes in size: float or + // for the low part of a pair that aren't 8 bytes in size: half, float or // i8/i16/i32. This can also include pointers when they are 32-bit (X32 and // NaCl). // Promote these to a larger type. - if (Lo->isFloatTy()) + if (Lo->isHalfTy() || Lo->isFloatTy()) Lo = llvm::Type::getDoubleTy(Lo->getContext()); else { assert((Lo->isIntegerTy() || Lo->isPointerTy()) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -17,6 +17,7 @@ avx512dqintrin.h avx512erintrin.h avx512fintrin.h + avx512fp16intrin.h avx512ifmaintrin.h avx512ifmavlintrin.h avx512pfintrin.h @@ -28,6 +29,7 @@ avx512vlbwintrin.h avx512vlcdintrin.h avx512vldqintrin.h + avx512vlfp16intrin.h avx512vlintrin.h avx512vp2intersectintrin.h avx512vlvp2intersectintrin.h diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -0,0 +1,444 @@ +/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512FP16INTRIN_H +#define __AVX512FP16INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { + return __a[0]; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { + return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { + return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { + return (__m256h)__builtin_ia32_undef256(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { + return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { + return (__m128h)__builtin_ia32_undef128(); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { + return (__m512h)__builtin_ia32_undef512(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { + return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + return (__m512h)(__v32hf){__h1, __h2, __h3, __h4, __h5, __h6, __h7, + __h8, __h9, __h10, __h11, __h12, __h13, __h14, + __h15, __h16, __h17, __h18, __h19, __h20, __h21, + __h22, __h23, __h24, __h25, __h26, __h27, __h28, + __h29, __h30, __h31, __h32}; +} + +#define _mm512_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, __h9, \ + __h10, __h11, __h12, __h13, __h14, __h15, __h16, __h17, \ + __h18, 
__h19, __h20, __h21, __h22, __h23, __h24, __h25, \ + __h26, __h27, __h28, __h29, __h30, __h31, __h32) \ + _mm512_set_ph((__h32), (__h31), (__h30), (__h29), (__h28), (__h27), (__h26), \ + (__h25), (__h24), (__h23), (__h22), (__h21), (__h20), (__h19), \ + (__h18), (__h17), (__h16), (__h15), (__h14), (__h13), (__h12), \ + (__h11), (__h10), (__h9), (__h8), (__h7), (__h6), (__h5), \ + (__h4), (__h3), (__h2), (__h1)) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { + return (__m256)__a; +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { + return (__m512)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { + return (__m128d)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { + return (__m256d)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { + return (__m512d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castph_si256(__m256h __a) { + return (__m256i)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castph_si512(__m512h __a) { + return (__m512i)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castsi256_ph(__m256i __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castsi512_ph(__m512i __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_castph256_ph128(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph128(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph256(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castph128_ph256(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castph128_ph512(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, 
-1, -1, -1,
+                                 -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+/// Constructs a 256-bit floating-point vector of [16 x half] from a
+///    128-bit floating-point vector of [8 x half]. The lower 128 bits
+///    contain the value of the source vector. The upper 128 bits are set
+///    to zero.
+///
+/// \headerfile
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x half].
+/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
+///    contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_zextph128_ph256(__m128h __a) {
+  return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
+                                 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+///    128-bit floating-point vector of [8 x half]. The lower 128 bits
+///    contain the value of the source vector. The upper 384 bits are set
+///    to zero.
+///
+/// \headerfile
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
+///    contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_zextph128_ph512(__m128h __a) {
+  return __builtin_shufflevector(
+      __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+      13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+///    256-bit floating-point vector of [16 x half]. The lower 256 bits
+///    contain the value of the source vector. The upper 256 bits are set
+///    to zero.
+///
+/// \headerfile
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 256-bit vector of [16 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
+///    contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_zextph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { + return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); +} + +// loads with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { + struct __mm_load_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u; + return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { + __m128h src = (__v8hf)__builtin_shufflevector( + (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sh(__mmask8 __U, const void *__A) { + return (__m128h)__builtin_ia32_loadsh128_mask( + (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_load_ph(void const *__p) { + return *(const __m512h *)__p; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_load_ph(void const *__p) { + return *(const __m256h *)__p; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { + return *(const __m128h *)__p; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_loadu_ph(void const *__p) { + struct __loadu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_loadu_ph(void const *__p) { + struct __loadu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { + struct __loadu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +// stores with vmovsh: +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, + __m128h __a) { + struct __mm_store_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, + __mmask8 __U, + __m128h __A) { + __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, + __m512h __A) { + *(__m512h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, + __m256h __A) { + *(__m256h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, + __m128h __A) { + *(__m128h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, + __m512h __A) { + struct __storeu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, + __m256h __A) { + struct __storeu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = 
__A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, + __m128h __A) { + struct __storeu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +// moves with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, + __m128h __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), + _mm_setzero_ph()); +} + +// vmovw: +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { + return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { + __v8hi __b = (__v8hi)__a; + return __b[0]; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, + (__v32hf)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { + return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_ph(__m512i __A, __m512h __B) { + return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h new file mode 100644 --- /dev/null +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -0,0 +1,119 @@ +/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX512VLFP16INTRIN_H +#define __AVX512VLFP16INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { + return __a[0]; +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { + return __a[0]; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { + return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { + return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { + return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 +_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + return (__m128h)(__v8hf){__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 +_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + return (__m256h)(__v16hf){__h1, __h2, __h3, __h4, __h5, __h6, + __h7, __h8, __h9, __h10, __h11, __h12, + __h13, __h14, __h15, __h16}; +} + +#define _mm_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8) \ + _mm_set_ph((__h8), (__h7), (__h6), (__h5), (__h4), (__h3), (__h2), (__h1)) + +#define _mm256_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, __h9, \ + __h10, __h11, __h12, __h13, __h14, __h15, __h16) \ + _mm256_set_ph((__h16), (__h15), (__h14), (__h13), (__h12), (__h11), (__h10), \ + (__h9), (__h8), (__h7), (__h6), (__h5), (__h4), (__h3), \ + (__h2), (__h1)) + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { + return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { + return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, + __m128h __A, + __m128h __W) { + return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, + (__v8hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, + (__v16hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { + return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { + return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutexvar_ph(__m128i __A, __m128h __B) { + return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 
+_mm256_permutexvar_ph(__m256i __A, __m256h __B) { + return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h --- a/clang/lib/Headers/cpuid.h +++ b/clang/lib/Headers/cpuid.h @@ -195,6 +195,7 @@ #define bit_PCONFIG 0x00040000 #define bit_IBT 0x00100000 #define bit_AMXBF16 0x00400000 +#define bit_AVX512FP16 0x00800000 #define bit_AMXTILE 0x01000000 #define bit_AMXINT8 0x02000000 diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -210,6 +210,20 @@ #include #endif +/* + * FIXME: _Float16 type is legal only when HW support float16 operation. + * We use __AVX512FP16__ to identify if float16 is supported or not, so + * when float16 is not supported, the related header is not included. + * + */ +#if defined(__AVX512FP16__) +#include +#endif + +#if defined(__AVX512FP16__) && defined(__AVX512VL__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AVX512BF16__) #include diff --git a/clang/test/CodeGen/X86/avx512fp16-abi.c b/clang/test/CodeGen/X86/avx512fp16-abi.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512fp16-abi.c @@ -0,0 +1,149 @@ +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -target-feature +avx512fp16 < %s | FileCheck %s --check-prefixes=CHECK + +struct half1 { + _Float16 a; +}; + +struct half1 h1(_Float16 a) { + // CHECK: define{{.*}}half @h1 + struct half1 x; + x.a = a; + return x; +} + +struct half2 { + _Float16 a; + _Float16 b; +}; + +struct half2 h2(_Float16 a, _Float16 b) { + // CHECK: define{{.*}}<2 x half> @h2 + struct half2 x; + x.a = a; + x.b = b; + return x; +} + +struct half3 { + _Float16 a; + _Float16 b; + _Float16 c; +}; + +struct half3 h3(_Float16 a, _Float16 b, _Float16 c) { + // CHECK: define{{.*}}<4 x half> @h3 + struct half3 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct half4 { + _Float16 a; + _Float16 b; + _Float16 c; + _Float16 d; +}; + +struct half4 h4(_Float16 a, _Float16 b, _Float16 c, _Float16 d) { + // CHECK: define{{.*}}<4 x half> @h4 + struct half4 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct floathalf { + float a; + _Float16 b; +}; + +struct floathalf fh(float a, _Float16 b) { + // CHECK: define{{.*}}<4 x half> @fh + struct floathalf x; + x.a = a; + x.b = b; + return x; +} + +struct floathalf2 { + float a; + _Float16 b; + _Float16 c; +}; + +struct floathalf2 fh2(float a, _Float16 b, _Float16 c) { + // CHECK: define{{.*}}<4 x half> @fh2 + struct floathalf2 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct halffloat { + _Float16 a; + float b; +}; + +struct halffloat hf(_Float16 a, float b) { + // CHECK: define{{.*}}<4 x half> @hf + struct halffloat x; + x.a = a; + x.b = b; + return x; +} + +struct half2float { + _Float16 a; + _Float16 b; + float c; +}; + +struct half2float h2f(_Float16 a, _Float16 b, float c) { + // CHECK: define{{.*}}<4 x half> @h2f + struct half2float x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct floathalf3 { + float a; + _Float16 b; + _Float16 c; + _Float16 d; +}; + +struct floathalf3 fh3(float a, _Float16 b, _Float16 c, _Float16 d) { + // CHECK: define{{.*}}{ <4 x half>, half } @fh3 + struct floathalf3 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct half5 { + _Float16 a; + _Float16 b; + _Float16 c; + _Float16 d; + 
_Float16 e; +}; + +struct half5 h5(_Float16 a, _Float16 b, _Float16 c, _Float16 d, _Float16 e) { + // CHECK: define{{.*}}{ <4 x half>, half } @h5 + struct half5 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + x.e = e; + return x; +} diff --git a/clang/test/CodeGen/X86/avx512fp16-builtins.c b/clang/test/CodeGen/X86/avx512fp16-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512fp16-builtins.c @@ -0,0 +1,526 @@ +// RUN: %clang_cc1 -ffreestanding -flax-vector-conversions=none %s -triple=x86_64-unknown-unknown -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +_Float16 test_mm512_cvtsh_h(__m512h __A) { + // CHECK-LABEL: @test_mm512_cvtsh_h + // CHECK: extractelement <32 x half> %{{.*}}, i32 0 + return _mm512_cvtsh_h(__A); +} + +__m128h test_mm_setzero_ph() { + // CHECK-LABEL: @test_mm_setzero_ph + // CHECK: zeroinitializer + return _mm_setzero_ph(); +} + +__m256h test_mm256_setzero_ph() { + // CHECK-LABEL: @test_mm256_setzero_ph + // CHECK: zeroinitializer + return _mm256_setzero_ph(); +} + +__m256h test_mm256_undefined_ph() { + // CHECK-LABEL: @test_mm256_undefined_ph + // CHECK: ret <16 x half> zeroinitializer + return _mm256_undefined_ph(); +} + +__m512h test_mm512_setzero_ph() { + // CHECK-LABEL: @test_mm512_setzero_ph + // CHECK: zeroinitializer + return _mm512_setzero_ph(); +} + +__m128h test_mm_undefined_ph() { + // CHECK-LABEL: @test_mm_undefined_ph + // CHECK: ret <8 x half> zeroinitializer + return _mm_undefined_ph(); +} + +__m512h test_mm512_undefined_ph() { + // CHECK-LABEL: @test_mm512_undefined_ph + // CHECK: ret <32 x half> zeroinitializer + return _mm512_undefined_ph(); +} + +__m512h test_mm512_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm512_set1_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + // CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_set1_ph(h); +} + +__m512h test_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, 
_Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + // CHECK-LABEL: @test_mm512_set_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + // CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16, + __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24, + __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32); +} + +__m512h test_mm512_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + // CHECK-LABEL: @test_mm512_setr_ph + // CHECK: insertelement <32 x half> {{.*}}, i32 0 + // CHECK: insertelement <32 x half> {{.*}}, i32 1 + // CHECK: insertelement <32 x half> {{.*}}, i32 2 + // CHECK: insertelement <32 x half> {{.*}}, i32 3 + // CHECK: insertelement <32 x half> {{.*}}, i32 4 + // CHECK: insertelement <32 x half> {{.*}}, i32 5 + // CHECK: insertelement <32 x half> {{.*}}, i32 6 + // CHECK: insertelement <32 x half> {{.*}}, i32 7 + // CHECK: insertelement <32 x half> {{.*}}, i32 8 + // CHECK: insertelement <32 x half> {{.*}}, i32 9 + 
// CHECK: insertelement <32 x half> {{.*}}, i32 10 + // CHECK: insertelement <32 x half> {{.*}}, i32 11 + // CHECK: insertelement <32 x half> {{.*}}, i32 12 + // CHECK: insertelement <32 x half> {{.*}}, i32 13 + // CHECK: insertelement <32 x half> {{.*}}, i32 14 + // CHECK: insertelement <32 x half> {{.*}}, i32 15 + // CHECK: insertelement <32 x half> {{.*}}, i32 16 + // CHECK: insertelement <32 x half> {{.*}}, i32 17 + // CHECK: insertelement <32 x half> {{.*}}, i32 18 + // CHECK: insertelement <32 x half> {{.*}}, i32 19 + // CHECK: insertelement <32 x half> {{.*}}, i32 20 + // CHECK: insertelement <32 x half> {{.*}}, i32 21 + // CHECK: insertelement <32 x half> {{.*}}, i32 22 + // CHECK: insertelement <32 x half> {{.*}}, i32 23 + // CHECK: insertelement <32 x half> {{.*}}, i32 24 + // CHECK: insertelement <32 x half> {{.*}}, i32 25 + // CHECK: insertelement <32 x half> {{.*}}, i32 26 + // CHECK: insertelement <32 x half> {{.*}}, i32 27 + // CHECK: insertelement <32 x half> {{.*}}, i32 28 + // CHECK: insertelement <32 x half> {{.*}}, i32 29 + // CHECK: insertelement <32 x half> {{.*}}, i32 30 + // CHECK: insertelement <32 x half> {{.*}}, i32 31 + return _mm512_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16, + __h17, __h18, __h19, __h20, __h21, __h22, __h23, __h24, + __h25, __h26, __h27, __h28, __h29, __h30, __h31, __h32); +} + +__m128 test_mm_castph_ps(__m128h A) { + // CHECK-LABEL: test_mm_castph_ps + // CHECK: bitcast <8 x half> %{{.*}} to <4 x float> + return _mm_castph_ps(A); +} + +__m256 test_mm256_castph_ps(__m256h A) { + // CHECK-LABEL: test_mm256_castph_ps + // CHECK: bitcast <16 x half> %{{.*}} to <8 x float> + return _mm256_castph_ps(A); +} + +__m512 test_mm512_castph_ps(__m512h A) { + // CHECK-LABEL: test_mm512_castph_ps + // CHECK: bitcast <32 x half> %{{.*}} to <16 x float> + return _mm512_castph_ps(A); +} + +__m128d test_mm_castph_pd(__m128h A) { + // CHECK-LABEL: test_mm_castph_pd + // CHECK: bitcast <8 x half> %{{.*}} to <2 x double> + return _mm_castph_pd(A); +} + +__m256d test_mm256_castph_pd(__m256h A) { + // CHECK-LABEL: test_mm256_castph_pd + // CHECK: bitcast <16 x half> %{{.*}} to <4 x double> + return _mm256_castph_pd(A); +} + +__m512d test_mm512_castph_pd(__m512h A) { + // CHECK-LABEL: test_mm512_castph_pd + // CHECK: bitcast <32 x half> %{{.*}} to <8 x double> + return _mm512_castph_pd(A); +} + +__m128i test_mm_castph_si128(__m128h A) { + // CHECK-LABEL: test_mm_castph_si128 + // CHECK: bitcast <8 x half> %{{.*}} to <2 x i64> + return _mm_castph_si128(A); +} + +__m256i test_mm256_castph_si256(__m256h A) { + // CHECK-LABEL: test_mm256_castph_si256 + // CHECK: bitcast <16 x half> %{{.*}} to <4 x i64> + return _mm256_castph_si256(A); +} + +__m512i test_mm512_castph_si512(__m512h A) { + // CHECK-LABEL: test_mm512_castph_si512 + // CHECK: bitcast <32 x half> %{{.*}} to <8 x i64> + return _mm512_castph_si512(A); +} + +__m128h test_mm_castps_ph(__m128 A) { + // CHECK-LABEL: test_mm_castps_ph + // CHECK: bitcast <4 x float> %{{.*}} to <8 x half> + return _mm_castps_ph(A); +} + +__m256h test_mm256_castps_ph(__m256 A) { + // CHECK-LABEL: test_mm256_castps_ph + // CHECK: bitcast <8 x float> %{{.*}} to <16 x half> + return _mm256_castps_ph(A); +} + +__m512h test_mm512_castps_ph(__m512 A) { + // CHECK-LABEL: test_mm512_castps_ph + // CHECK: bitcast <16 x float> %{{.*}} to <32 x half> + return _mm512_castps_ph(A); +} + +__m128h test_mm_castpd_ph(__m128d A) { + // CHECK-LABEL: test_mm_castpd_ph + // CHECK: bitcast 
<2 x double> %{{.*}} to <8 x half> + return _mm_castpd_ph(A); +} + +__m256h test_mm256_castpd_ph(__m256d A) { + // CHECK-LABEL: test_mm256_castpd_ph + // CHECK: bitcast <4 x double> %{{.*}} to <16 x half> + return _mm256_castpd_ph(A); +} + +__m512h test_mm512_castpd_ph(__m512d A) { + // CHECK-LABEL: test_mm512_castpd_ph + // CHECK: bitcast <8 x double> %{{.*}} to <32 x half> + return _mm512_castpd_ph(A); +} + +__m128h test_mm_castsi128_ph(__m128i A) { + // CHECK-LABEL: test_mm_castsi128_ph + // CHECK: bitcast <2 x i64> %{{.*}} to <8 x half> + return _mm_castsi128_ph(A); +} + +__m256h test_mm256_castsi256_ph(__m256i A) { + // CHECK-LABEL: test_mm256_castsi256_ph + // CHECK: bitcast <4 x i64> %{{.*}} to <16 x half> + return _mm256_castsi256_ph(A); +} + +__m512h test_mm512_castsi512_ph(__m512i A) { + // CHECK-LABEL: test_mm512_castsi512_ph + // CHECK: bitcast <8 x i64> %{{.*}} to <32 x half> + return _mm512_castsi512_ph(A); +} + +__m128h test_mm256_castph256_ph128(__m256h __a) { + // CHECK-LABEL: test_mm256_castph256_ph128 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x i32> + return _mm256_castph256_ph128(__a); +} + +__m128h test_mm512_castph512_ph128(__m512h __a) { + // CHECK-LABEL: test_mm512_castph512_ph128 + // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <8 x i32> + return _mm512_castph512_ph128(__a); +} + +__m256h test_mm512_castph512_ph256(__m512h __a) { + // CHECK-LABEL: test_mm512_castph512_ph256 + // CHECK: shufflevector <32 x half> %{{.*}}, <32 x half> %{{.*}}, <16 x i32> + return _mm512_castph512_ph256(__a); +} + +__m256h test_mm256_castph128_ph256(__m128h __a) { + // CHECK-LABEL: test_mm256_castph128_ph256 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x i32> + return _mm256_castph128_ph256(__a); +} + +__m512h test_mm512_castph128_ph512(__m128h __a) { + // CHECK-LABEL: test_mm512_castph128_ph512 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> %{{.*}}, <32 x i32> + return _mm512_castph128_ph512(__a); +} + +__m512h test_mm512_castph256_ph512(__m256h __a) { + // CHECK-LABEL: test_mm512_castph256_ph512 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> %{{.*}}, <32 x i32> + return _mm512_castph256_ph512(__a); +} + +__m256h test_mm256_zextph128_ph256(__m128h __a) { + // CHECK-LABEL: test_mm256_zextph128_ph256 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <16 x i32> + return _mm256_zextph128_ph256(__a); +} + +__m512h test_mm512_zextph128_ph512(__m128h __a) { + // CHECK-LABEL: test_mm512_zextph128_ph512 + // CHECK: shufflevector <8 x half> %{{.*}}, <8 x half> {{.*}}, <32 x i32> + return _mm512_zextph128_ph512(__a); +} + +__m512h test_mm512_zextph256_ph512(__m256h __a) { + // CHECK-LABEL: test_mm512_zextph256_ph512 + // CHECK: shufflevector <16 x half> %{{.*}}, <16 x half> {{.*}}, <32 x i32> + return _mm512_zextph256_ph512(__a); +} + +__m512h test_mm512_abs_ph(__m512h a) { + // CHECK-LABEL: @test_mm512_abs_ph + // CHECK: and <16 x i32> + return _mm512_abs_ph(a); +} + +// VMOVSH + +__m128h test_mm_load_sh(void const *A) { + // CHECK-LABEL: test_mm_load_sh + // CHECK: load half, half* %{{.*}}, align 1{{$}} + return _mm_load_sh(A); +} + +__m128h test_mm_mask_load_sh(__m128h __A, __mmask8 __U, const void *__W) { + // CHECK-LABEL: @test_mm_mask_load_sh + // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + return _mm_mask_load_sh(__A, __U, __W); +} + +__m128h test_mm_maskz_load_sh(__mmask8 __U, const void 
*__W) { + // CHECK-LABEL: @test_mm_maskz_load_sh + // CHECK: %{{.*}} = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x half> %{{.*}}) + return _mm_maskz_load_sh(__U, __W); +} + +__m512h test_mm512_load_ph(void *p) { + // CHECK-LABEL: @test_mm512_load_ph + // CHECK: load <32 x half>, <32 x half>* %{{.*}}, align 64 + return _mm512_load_ph(p); +} + +__m256h test_mm256_load_ph(void *p) { + // CHECK-LABEL: @test_mm256_load_ph + // CHECK: load <16 x half>, <16 x half>* %{{.*}}, align 32 + return _mm256_load_ph(p); +} + +__m128h test_mm_load_ph(void *p) { + // CHECK-LABEL: @test_mm_load_ph + // CHECK: load <8 x half>, <8 x half>* %{{.*}}, align 16 + return _mm_load_ph(p); +} + +__m512h test_mm512_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm512_loadu_ph + // CHECK: load <32 x half>, <32 x half>* {{.*}}, align 1{{$}} + return _mm512_loadu_ph(p); +} + +__m256h test_mm256_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm256_loadu_ph + // CHECK: load <16 x half>, <16 x half>* {{.*}}, align 1{{$}} + return _mm256_loadu_ph(p); +} + +__m128h test_mm_loadu_ph(void *p) { + // CHECK-LABEL: @test_mm_loadu_ph + // CHECK: load <8 x half>, <8 x half>* {{.*}}, align 1{{$}} + return _mm_loadu_ph(p); +} + +void test_mm_store_sh(void *A, __m128h B) { + // CHECK-LABEL: test_mm_store_sh + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: store half %{{.*}}, half* %{{.*}}, align 1{{$}} + _mm_store_sh(A, B); +} + +void test_mm_mask_store_sh(void *__P, __mmask8 __U, __m128h __A) { + // CHECK-LABEL: @test_mm_mask_store_sh + // CHECK: call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %{{.*}}, <8 x half>* %{{.*}}, i32 1, <8 x i1> %{{.*}}) + _mm_mask_store_sh(__P, __U, __A); +} + +void test_mm512_store_ph(void *p, __m512h a) { + // CHECK-LABEL: @test_mm512_store_ph + // CHECK: store <32 x half> %{{.*}}, <32 x half>* %{{.*}}, align 64 + _mm512_store_ph(p, a); +} + +void test_mm256_store_ph(void *p, __m256h a) { + // CHECK-LABEL: @test_mm256_store_ph + // CHECK: store <16 x half> %{{.*}}, <16 x half>* %{{.*}}, align 32 + _mm256_store_ph(p, a); +} + +void test_mm_store_ph(void *p, __m128h a) { + // CHECK-LABEL: @test_mm_store_ph + // CHECK: store <8 x half> %{{.*}}, <8 x half>* %{{.*}}, align 16 + _mm_store_ph(p, a); +} + +void test_mm512_storeu_ph(void *p, __m512h a) { + // CHECK-LABEL: @test_mm512_storeu_ph + // CHECK: store <32 x half> %{{.*}}, <32 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm512_storeu_ph(p, a); +} + +void test_mm256_storeu_ph(void *p, __m256h a) { + // CHECK-LABEL: @test_mm256_storeu_ph + // CHECK: store <16 x half> %{{.*}}, <16 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm256_storeu_ph(p, a); +} + +void test_mm_storeu_ph(void *p, __m128h a) { + // CHECK-LABEL: @test_mm_storeu_ph + // CHECK: store <8 x half> %{{.*}}, <8 x half>* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_ph(p, a); +} + +__m128h test_mm_move_sh(__m128h A, __m128h B) { + // CHECK-LABEL: test_mm_move_sh + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half %{{.*}}, i32 0 + return _mm_move_sh(A, B); +} + +__m128h test_mm_mask_move_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_mask_move_sh + // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x 
half> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]] + // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 + return _mm_mask_move_sh(__W, __U, __A, __B); +} + +__m128h test_mm_maskz_move_sh(__mmask8 __U, __m128h __A, __m128h __B) { + // CHECK-LABEL: @test_mm_maskz_move_sh + // CHECK: [[EXT:%.*]] = extractelement <8 x half> %{{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half [[EXT]], i32 0 + // CHECK: [[A:%.*]] = extractelement <8 x half> [[VEC:%.*]], i64 0 + // CHECK-NEXT: [[B:%.*]] = extractelement <8 x half> %{{.*}}, i64 0 + // CHECK-NEXT: bitcast i8 %{{.*}} to <8 x i1> + // CHECK-NEXT: extractelement <8 x i1> %{{.*}}, i64 0 + // CHECK-NEXT: [[SEL:%.*]] = select i1 %{{.*}}, half [[A]], half [[B]] + // CHECK-NEXT: insertelement <8 x half> [[VEC]], half [[SEL]], i64 0 + return _mm_maskz_move_sh(__U, __A, __B); +} + +short test_mm_cvtsi128_si16(__m128i A) { + // CHECK-LABEL: test_mm_cvtsi128_si16 + // CHECK: extractelement <8 x i16> %{{.*}}, i32 0 + return _mm_cvtsi128_si16(A); +} + +__m128i test_mm_cvtsi16_si128(short A) { + // CHECK-LABEL: test_mm_cvtsi16_si128 + // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3 + return _mm_cvtsi16_si128(A); +} + +__m512h test_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + // CHECK-LABEL: @test_mm512_mask_blend_ph + // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> + // CHECK: %{{.*}} = select <32 x i1> %{{.*}}, <32 x half> %{{.*}}, <32 x half> %{{.*}} + return _mm512_mask_blend_ph(__U, __A, __W); +} + +__m512h test_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { + // CHECK-LABEL: @test_mm512_permutex2var_ph + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half> + return _mm512_permutex2var_ph(__A, __I, __B); +} + +__m512h test_mm512_permutexvar_epi16(__m512i __A, __m512h __B) { + // CHECK-LABEL: @test_mm512_permutexvar_epi16 + // CHECK: %{{.*}} = bitcast <32 x half> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = bitcast <8 x i64> %{{.*}} to <32 x i16> + // CHECK: %{{.*}} = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <32 x i16> %{{.*}} to <32 x half> + return _mm512_permutexvar_ph(__A, __B); +} diff --git a/clang/test/CodeGen/X86/avx512vlfp16-builtins.c b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/X86/avx512vlfp16-builtins.c @@ -0,0 +1,204 @@ +// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-unknown -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +_Float16 test_mm_cvtsh_h(__m128h __A) { + // CHECK-LABEL: @test_mm_cvtsh_h + // CHECK: extractelement <8 x half> %{{.*}}, i32 0 + return _mm_cvtsh_h(__A); +} + +_Float16 test_mm256_cvtsh_h(__m256h __A) { + // CHECK-LABEL: @test_mm256_cvtsh_h + // CHECK: 
extractelement <16 x half> %{{.*}}, i32 0 + return _mm256_cvtsh_h(__A); +} + +__m128h test_mm_set_sh(_Float16 __h) { + // CHECK-LABEL: @test_mm_set_sh + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 1 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 2 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 3 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 4 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 5 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 6 + // CHECK: insertelement <8 x half> %{{.*}}, half 0xH0000, i32 7 + return _mm_set_sh(__h); +} + +__m128h test_mm_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm_set1_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_set1_ph(h); +} + +__m256h test_mm256_set1_ph(_Float16 h) { + // CHECK-LABEL: @test_mm256_set1_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_set1_ph(h); +} + +__m128h test_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + // CHECK-LABEL: @test_mm_set_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8); +} + +__m256h test_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + // CHECK-LABEL: @test_mm256_set_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 
7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_set_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); +} + +__m128h test_mm_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + // CHECK-LABEL: @test_mm_setr_ph + // CHECK: insertelement <8 x half> {{.*}}, i32 0 + // CHECK: insertelement <8 x half> {{.*}}, i32 1 + // CHECK: insertelement <8 x half> {{.*}}, i32 2 + // CHECK: insertelement <8 x half> {{.*}}, i32 3 + // CHECK: insertelement <8 x half> {{.*}}, i32 4 + // CHECK: insertelement <8 x half> {{.*}}, i32 5 + // CHECK: insertelement <8 x half> {{.*}}, i32 6 + // CHECK: insertelement <8 x half> {{.*}}, i32 7 + return _mm_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8); +} + +__m256h test_mm256_setr_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + // CHECK-LABEL: @test_mm256_setr_ph + // CHECK: insertelement <16 x half> {{.*}}, i32 0 + // CHECK: insertelement <16 x half> {{.*}}, i32 1 + // CHECK: insertelement <16 x half> {{.*}}, i32 2 + // CHECK: insertelement <16 x half> {{.*}}, i32 3 + // CHECK: insertelement <16 x half> {{.*}}, i32 4 + // CHECK: insertelement <16 x half> {{.*}}, i32 5 + // CHECK: insertelement <16 x half> {{.*}}, i32 6 + // CHECK: insertelement <16 x half> {{.*}}, i32 7 + // CHECK: insertelement <16 x half> {{.*}}, i32 8 + // CHECK: insertelement <16 x half> {{.*}}, i32 9 + // CHECK: insertelement <16 x half> {{.*}}, i32 10 + // CHECK: insertelement <16 x half> {{.*}}, i32 11 + // CHECK: insertelement <16 x half> {{.*}}, i32 12 + // CHECK: insertelement <16 x half> {{.*}}, i32 13 + // CHECK: insertelement <16 x half> {{.*}}, i32 14 + // CHECK: insertelement <16 x half> {{.*}}, i32 15 + return _mm256_setr_ph(__h1, __h2, __h3, __h4, __h5, __h6, __h7, __h8, + __h9, __h10, __h11, __h12, __h13, __h14, __h15, __h16); +} + +__m128h test_mm_abs_ph(__m128h a) { + // CHECK-LABEL: @test_mm_abs_ph + // CHECK: and <4 x i32> + return _mm_abs_ph(a); +} + +__m256h test_mm256_abs_ph(__m256h a) { + // CHECK-LABEL: @test_mm256_abs_ph + // CHECK: and <8 x i32> + return _mm256_abs_ph(a); +} + +__m128h test_mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { + // CHECK-LABEL: @test_mm_mask_blend_ph + // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> + // CHECK: %{{.*}} = select <8 x i1> %{{.*}}, <8 x half> %{{.*}}, <8 x half> %{{.*}} + return _mm_mask_blend_ph(__U, __A, __W); +} + +__m256h test_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { + // CHECK-LABEL: @test_mm256_mask_blend_ph + // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> + // CHECK: %{{.*}} = select <16 x i1> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}} + return _mm256_mask_blend_ph(__U, __A, __W); +} + +__m128h test_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { + // CHECK-LABEL: @test_mm_permutex2var_ph + // CHECK: %{{.*}} = bitcast <8 x half> 
%{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x half> + return _mm_permutex2var_ph(__A, __I, __B); +} + +__m256h test_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { + // CHECK-LABEL: @test_mm256_permutex2var_ph + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> + return _mm256_permutex2var_ph(__A, __I, __B); +} + +__m128h test_mm_permutexvar_ph(__m128i __A, __m128h __B) { + // CHECK-LABEL: @test_mm_permutexvar_ph + // CHECK: %{{.*}} = bitcast <8 x half> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = bitcast <2 x i64> %{{.*}} to <8 x i16> + // CHECK: %{{.*}} = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <8 x i16> %{{.*}} to <8 x half> + return _mm_permutexvar_ph(__A, __B); +} + +__m256h test_mm256_permutexvar_ph(__m256i __A, __m256h __B) { + // CHECK-LABEL: @test_mm256_permutexvar_ph + // CHECK: %{{.*}} = bitcast <16 x half> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = bitcast <4 x i64> %{{.*}} to <16 x i16> + // CHECK: %{{.*}} = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) + // CHECK: %{{.*}} = bitcast <16 x i16> %{{.*}} to <16 x half> + return _mm256_permutexvar_ph(__A, __B); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -54,9 +54,9 @@ // CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" -// CHECK: #4 = {{.*}}"target-cpu"="i686" 
"target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512fp16,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" // CHECK-NOT: tune-cpu // CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -293,3 +293,8 @@ // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s // AVX-VNNI: "-target-feature" "+avxvnni" // NO-AVX-VNNI: "-target-feature" "-avxvnni" + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx512fp16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX512FP16 %s +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx512fp16 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX512FP16 %s +// AVX512FP16: "-target-feature" "+avx512fp16" +// NO-AVX512FP16: "-target-feature" "-avx512fp16" diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -1656,6 +1656,7 @@ // CHECK_SPR_M32: #define __AVX512BW__ 1 // CHECK_SPR_M32: #define __AVX512CD__ 1 // CHECK_SPR_M32: #define __AVX512DQ__ 1 +// CHECK_SPR_M32: #define __AVX512FP16__ 1 // CHECK_SPR_M32: #define __AVX512F__ 1 // CHECK_SPR_M32: #define __AVX512IFMA__ 1 // CHECK_SPR_M32: #define __AVX512VBMI2__ 1 @@ -1727,6 +1728,7 @@ // CHECK_SPR_M64: #define __AVX512BW__ 1 // CHECK_SPR_M64: #define __AVX512CD__ 1 // CHECK_SPR_M64: #define __AVX512DQ__ 1 +// CHECK_SPR_M64: #define __AVX512FP16__ 1 // CHECK_SPR_M64: #define __AVX512F__ 1 // CHECK_SPR_M64: #define __AVX512IFMA__ 1 // CHECK_SPR_M64: #define __AVX512VBMI2__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -558,3 +558,25 @@ // AVXVNNINOAVX2-NOT: #define __AVX2__ 1 // AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16 %s + +// AVX512FP16: #define __AVX512BW__ 1 +// AVX512FP16: #define __AVX512DQ__ 1 +// AVX512FP16: #define __AVX512FP16__ 1 +// AVX512FP16: #define __AVX512VL__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512VL %s + +// AVX512FP16NOAVX512VL-NOT: #define __AVX512FP16__ 1 +// AVX512FP16NOAVX512VL-NOT: #define 
__AVX512VL__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512BW %s + +// AVX512FP16NOAVX512BW-NOT: #define __AVX512BW__ 1 +// AVX512FP16NOAVX512BW-NOT: #define __AVX512FP16__ 1 + +// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512fp16 -mno-avx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512FP16NOAVX512DQ %s + +// AVX512FP16NOAVX512DQ-NOT: #define __AVX512DQ__ 1 +// AVX512FP16NOAVX512DQ-NOT: #define __AVX512FP16__ 1 diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -96,6 +96,8 @@ During this release ... +* Support for ``AVX512-FP16`` instructions has been added. + Changes to the AMDGPU Target ----------------------------- diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -312,6 +312,8 @@ def llvm_v2f16_ty : LLVMType; // 2 x half (__fp16) def llvm_v4f16_ty : LLVMType; // 4 x half (__fp16) def llvm_v8f16_ty : LLVMType; // 8 x half (__fp16) +def llvm_v16f16_ty : LLVMType; // 16 x half (__fp16) +def llvm_v32f16_ty : LLVMType; // 32 x half (__fp16) def llvm_v2bf16_ty : LLVMType; // 2 x bfloat (__bf16) def llvm_v4bf16_ty : LLVMType; // 4 x bfloat (__bf16) def llvm_v8bf16_ty : LLVMType; // 8 x bfloat (__bf16) diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -31,6 +31,8 @@ #define XOP9_MAP_SYM x86DisassemblerXOP9Opcodes #define XOPA_MAP_SYM x86DisassemblerXOPAOpcodes #define THREEDNOW_MAP_SYM x86Disassembler3DNowOpcodes +#define MAP5_SYM x86DisassemblerMap5Opcodes +#define MAP6_SYM x86DisassemblerMap6Opcodes #define INSTRUCTIONS_STR "x86DisassemblerInstrSpecifiers" #define CONTEXTS_STR "x86DisassemblerContexts" @@ -42,6 +44,8 @@ #define XOP9_MAP_STR "x86DisassemblerXOP9Opcodes" #define XOPA_MAP_STR "x86DisassemblerXOPAOpcodes" #define THREEDNOW_MAP_STR "x86Disassembler3DNowOpcodes" +#define MAP5_STR "x86DisassemblerMap5Opcodes" +#define MAP6_STR "x86DisassemblerMap6Opcodes" // Attributes of an instruction that must be known before the opcode can be // processed correctly. Most of these indicate the presence of particular @@ -292,7 +296,9 @@ XOP8_MAP = 4, XOP9_MAP = 5, XOPA_MAP = 6, - THREEDNOW_MAP = 7 + THREEDNOW_MAP = 7, + MAP5 = 8, + MAP6 = 9 }; // The following structs are used for the hierarchical decode table. After diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -193,6 +193,7 @@ X86_FEATURE (XSAVEOPT, "xsaveopt") X86_FEATURE (XSAVES, "xsaves") X86_FEATURE (HRESET, "hreset") +X86_FEATURE (AVX512FP16, "avx512fp16") X86_FEATURE (AVXVNNI, "avxvnni") // These features aren't really CPU features, but the frontend can set them. 
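/* A minimal usage sketch (illustrative only): the preprocessor tests above
 * encode the feature implications -- enabling -mavx512fp16 also defines
 * __AVX512BW__, __AVX512DQ__ and __AVX512VL__, and disabling any of those
 * three disables __AVX512FP16__ again -- so user code can key on the single
 * __AVX512FP16__ macro. */
#include <immintrin.h>

#if defined(__AVX512FP16__)
/* Native path: __m128h and _Float16 are usable here. */
_Float16 first_lane(__m128h v) { return _mm_cvtsh_h(v); }
#else
/* Fallback for targets without AVX512-FP16. */
float first_lane_fallback(const float *p) { return p[0]; }
#endif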
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -1050,6 +1050,10 @@ let IsLoad = true; let ScalarMemoryVT = i32; } +def extloadvf16 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { + let IsLoad = true; + let ScalarMemoryVT = f16; +} def extloadvf32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> { let IsLoad = true; let ScalarMemoryVT = f32; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -820,6 +820,7 @@ case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; + case ISD::STRICT_FP_TO_FP16: case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; @@ -871,13 +872,17 @@ // We actually deal with the partially-softened FP_TO_FP16 node too, which // returns an i16 so doesn't meet the constraints necessary for FP_ROUND. assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16 || N->getOpcode() == ISD::STRICT_FP_ROUND); bool IsStrict = N->isStrictFPOpcode(); SDValue Op = N->getOperand(IsStrict ? 1 : 0); EVT SVT = Op.getValueType(); EVT RVT = N->getValueType(0); - EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT; + EVT FloatRVT = (N->getOpcode() == ISD::FP_TO_FP16 || + N->getOpcode() == ISD::STRICT_FP_TO_FP16) + ? MVT::f16 + : RVT; RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall"); diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1633,6 +1633,7 @@ // For more info, see X86 ISA docs. Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1); Features["amx-bf16"] = HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave; + Features["avx512fp16"] = HasLeaf7 && ((EDX >> 23) & 1) && HasAVX512Save; Features["amx-tile"] = HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave; Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave; bool HasLeaf7Subleaf1 = diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp --- a/llvm/lib/Support/X86TargetParser.cpp +++ b/llvm/lib/Support/X86TargetParser.cpp @@ -201,11 +201,11 @@ FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B | FeatureCLWB | FeatureMOVDIRI | FeatureSHSTK | FeatureKL | FeatureWIDEKL; constexpr FeatureBitset FeaturesSapphireRapids = - FeaturesICLServer | FeatureAMX_TILE | FeatureAMX_INT8 | FeatureAMX_BF16 | - FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE | - FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE | - FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR | - FeatureWAITPKG | FeatureAVXVNNI; + FeaturesICLServer | FeatureAMX_BF16 | FeatureAMX_INT8 | FeatureAMX_TILE | + FeatureAVX512BF16 | FeatureAVX512FP16 | FeatureAVX512VP2INTERSECT | + FeatureAVXVNNI | FeatureCLDEMOTE | FeatureENQCMD | FeatureMOVDIR64B | + FeatureMOVDIRI | FeaturePTWRITE | FeatureSERIALIZE | FeatureSHSTK | + FeatureTSXLDTRK | FeatureUINTR | FeatureWAITPKG; // Intel Atom processors. 
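/* A runtime-detection sketch mirroring the Host.cpp change above: AVX512-FP16
 * is reported in CPUID leaf 7, sub-leaf 0, EDX bit 23.  A complete check (as
 * in Host.cpp) also requires that the OS saves AVX-512 state (the
 * HasAVX512Save condition), which is omitted here for brevity. */
#include <cpuid.h>

int cpu_reports_avx512fp16(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (edx >> 23) & 1;
}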
// Bonnell has feature parity with Core2 and adds MOVBE. @@ -576,6 +576,8 @@ constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; +static constexpr FeatureBitset ImpliedFeaturesAVX512FP16 = + FeatureAVX512BW | FeatureAVX512DQ | FeatureAVX512VL; // Key Locker Features constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2; constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL; diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2753,6 +2753,7 @@ .Case("1to4", "{1to4}") .Case("1to8", "{1to8}") .Case("1to16", "{1to16}") + .Case("1to32", "{1to32}") .Default(nullptr); if (!BroadcastPrimitive) return TokError("Invalid memory broadcast primitive."); diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -150,6 +150,12 @@ dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; break; + case MAP5: + dec = &MAP5_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; + case MAP6: + dec = &MAP6_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; + break; } switch (dec->modrm_type) { @@ -332,7 +338,7 @@ } if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) && - ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) { + ((~byte1 & 0x8) == 0x8) && ((byte2 & 0x4) == 0x4)) { insn->vectorExtensionType = TYPE_EVEX; } else { --insn->readerCursor; // unconsume byte1 @@ -876,11 +882,11 @@ insn->opcodeType = ONEBYTE; if (insn->vectorExtensionType == TYPE_EVEX) { - switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { + switch (mmmFromEVEX2of4(insn->vectorExtensionPrefix[1])) { default: LLVM_DEBUG( - dbgs() << format("Unhandled mm field for instruction (0x%hhx)", - mmFromEVEX2of4(insn->vectorExtensionPrefix[1]))); + dbgs() << format("Unhandled mmm field for instruction (0x%hhx)", + mmmFromEVEX2of4(insn->vectorExtensionPrefix[1]))); return true; case VEX_LOB_0F: insn->opcodeType = TWOBYTE; @@ -891,6 +897,12 @@ case VEX_LOB_0F3A: insn->opcodeType = THREEBYTE_3A; return consume(insn, insn->opcode); + case VEX_LOB_MAP5: + insn->opcodeType = MAP5; + return consume(insn, insn->opcode); + case VEX_LOB_MAP6: + insn->opcodeType = MAP6; + return consume(insn, insn->opcode); } } else if (insn->vectorExtensionType == TYPE_VEX_3B) { switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) { @@ -908,6 +920,12 @@ case VEX_LOB_0F3A: insn->opcodeType = THREEBYTE_3A; return consume(insn, insn->opcode); + case VEX_LOB_MAP5: + insn->opcodeType = MAP5; + return consume(insn, insn->opcode); + case VEX_LOB_MAP6: + insn->opcodeType = MAP6; + return consume(insn, insn->opcode); } } else if (insn->vectorExtensionType == TYPE_VEX_2B) { insn->opcodeType = TWOBYTE; @@ -1043,6 +1061,12 @@ case THREEDNOW_MAP: decision = &THREEDNOW_MAP_SYM; break; + case MAP5: + decision = &MAP5_SYM; + break; + case MAP6: + decision = &MAP6_SYM; + break; } if (decision->opcodeDecisions[insnCtx] diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -37,7 +37,7 @@ #define xFromEVEX2of4(evex) 
(((~(evex)) & 0x40) >> 6) #define bFromEVEX2of4(evex) (((~(evex)) & 0x20) >> 5) #define r2FromEVEX2of4(evex) (((~(evex)) & 0x10) >> 4) -#define mmFromEVEX2of4(evex) ((evex) & 0x3) +#define mmmFromEVEX2of4(evex) ((evex) & 0x7) #define wFromEVEX3of4(evex) (((evex) & 0x80) >> 7) #define vvvvFromEVEX3of4(evex) (((~(evex)) & 0x78) >> 3) #define ppFromEVEX3of4(evex) ((evex) & 0x3) @@ -489,7 +489,9 @@ enum VEXLeadingOpcodeByte { VEX_LOB_0F = 0x1, VEX_LOB_0F38 = 0x2, - VEX_LOB_0F3A = 0x3 + VEX_LOB_0F3A = 0x3, + VEX_LOB_MAP5 = 0x5, + VEX_LOB_MAP6 = 0x6 }; enum XOPMapSelect { diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -790,7 +790,7 @@ // belongs to. i.e. one-byte, two-byte, 0x0f 0x38, 0x0f 0x3a, etc. // OpMapShift = OpPrefixShift + 2, - OpMapMask = 0x7 << OpMapShift, + OpMapMask = 0xF << OpMapShift, // OB - OneByte - Set if this instruction has a one byte opcode. OB = 0 << OpMapShift, @@ -819,13 +819,17 @@ /// this flag to indicate that the encoder should do the wacky 3DNow! thing. ThreeDNow = 7 << OpMapShift, + // MAP5, MAP6 - Prefix after the 0x0F prefix. + T_MAP5 = 8 << OpMapShift, + T_MAP6 = 9 << OpMapShift, + //===------------------------------------------------------------------===// // REX_W - REX prefixes are instruction prefixes used in 64-bit mode. // They are used to specify GPRs and SSE registers, 64-bit operand size, // etc. We only cares about REX.W and REX.R bits and only the former is // statically determined. // - REXShift = OpMapShift + 3, + REXShift = OpMapShift + 4, REX_W = 1 << REXShift, //===------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -799,7 +799,10 @@ // 0b00001: implied 0F leading opcode // 0b00010: implied 0F 38 leading opcode bytes // 0b00011: implied 0F 3A leading opcode bytes - // 0b00100-0b11111: Reserved for future use + // 0b00100: Reserved for future use + // 0b00101: VEX MAP5 + // 0b00110: VEX MAP6 + // 0b00111-0b11111: Reserved for future use // 0b01000: XOP map select - 08h instructions with imm byte // 0b01001: XOP map select - 09h instructions with no imm byte // 0b01010: XOP map select - 0Ah instructions with imm dword @@ -825,6 +828,12 @@ case X86II::XOPA: VEX_5M = 0xA; break; + case X86II::T_MAP5: + VEX_5M = 0x5; + break; + case X86II::T_MAP6: + VEX_5M = 0x6; + break; } // VEX_4V (VEX vvvv field): a register specifier @@ -1173,10 +1182,10 @@ // EVEX opcode prefix can have 4 bytes // // +-----+ +--------------+ +-------------------+ +------------------------+ - // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | + // | 62h | | RXBR' | 0mmm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa | // +-----+ +--------------+ +-------------------+ +------------------------+ - assert((VEX_5M & 0x3) == VEX_5M && - "More than 2 significant bits in VEX.m-mmmm fields for EVEX!"); + assert((VEX_5M & 0x7) == VEX_5M && + "More than 3 significant bits in VEX.m-mmmm fields for EVEX!"); emitByte(0x62, OS); emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) | diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ 
-170,6 +170,14 @@ "HasVP2INTERSECT", "true", "Enable AVX-512 vp2intersect", [FeatureAVX512]>; +// FIXME: FP16 scalar intrinsics use the type v8f16, which is supposed to be +// guarded under condition hasVLX. So we imply it in FeatureFP16 currently. +// FIXME: FP16 conversion between f16 and i64 customize type v8i64, which is +// supposed to be guarded under condition hasDQI. So we imply it in FeatureFP16 +// currently. +def FeatureFP16 : SubtargetFeature<"avx512fp16", "HasFP16", "true", + "Support 16-bit floating point", + [FeatureBWI, FeatureVLX, FeatureDQI]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; @@ -821,6 +829,7 @@ FeatureCLDEMOTE, FeatureWAITPKG, FeaturePTWRITE, + FeatureFP16, FeatureAVXVNNI, FeatureTSXLDTRK, FeatureENQCMD, diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -233,19 +233,19 @@ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // MMX vector types are always returned in MM0. If the target doesn't have @@ -267,6 +267,7 @@ CCIfInReg>>>, CCIfType<[f32,f64], CCAssignToReg<[FP0, FP1]>>, + CCIfType<[f16], CCAssignToReg<[XMM0,XMM1,XMM2]>>, CCDelegateTo ]>; @@ -329,6 +330,7 @@ // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. + CCIfType<[f16], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>, CCIfType<[f128], CCAssignToReg<[XMM0, XMM1]>>, @@ -552,7 +554,7 @@ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -561,33 +563,33 @@ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. 
- CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + CCIfType<[i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>, // Long doubles get stack slots whose size and alignment depends on the // subtarget. CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -635,13 +637,13 @@ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -655,7 +657,7 @@ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType>>, // The first 4 FP/Vector arguments are passed in XMM registers. - CCIfType<[f32, f64], + CCIfType<[f16, f32, f64], CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3], [RCX , RDX , R8 , R9 ]>>, @@ -678,7 +680,7 @@ // Integer/FP values get stored in stack slots that are 8 bytes in size and // 8-byte aligned if there are no more registers to hold them. - CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>> + CCIfType<[i8, i16, i32, i64, f16, f32, f64], CCAssignToStack<8, 8>> ]>; def CC_X86_Win64_VectorCall : CallingConv<[ @@ -757,14 +759,15 @@ /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -772,16 +775,16 @@ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -791,16 +794,16 @@ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. 
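/* A small sketch of what the f16 calling-convention entries above provide:
 * with -mavx512fp16, scalar _Float16 values follow the SSE register
 * convention, so the arguments below are expected in XMM registers and the
 * return value in XMM0 (per the additions to RetCC_X86_64_C and CC_X86_64_C)
 * rather than being passed through memory or the x87 stack. */
_Float16 select_h(_Float16 a, _Float16 b, int use_a) {
  return use_a ? a : b; /* only FP16 moves/selects, no arithmetic needed */
}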
- CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo @@ -819,11 +822,15 @@ CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>, + CCIfNotVarArg>>>, + // The first 3 __m64 vector arguments are passed in mmx registers if the // call is not a vararg call. CCIfNotVarArg>>, + CCIfType<[f16], CCAssignToStack<4, 4>>, + // Integer/Float values get stored in stack slots that are 4 bytes in // size and 4-byte aligned. CCIfType<[i32, f32], CCAssignToStack<4, 4>>, diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -55,6 +55,7 @@ /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf64; bool X86ScalarSSEf32; + bool X86ScalarSSEf16; public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, @@ -63,6 +64,7 @@ Subtarget = &funcInfo.MF->getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); + X86ScalarSSEf16 = Subtarget->hasFP16(); } bool fastSelectInstruction(const Instruction *I) override; @@ -157,7 +159,8 @@ /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); @@ -2283,9 +2286,10 @@ unsigned Opc; switch (RetVT.SimpleTy) { default: return false; - case MVT::i8: Opc = X86::CMOV_GR8; break; - case MVT::i16: Opc = X86::CMOV_GR16; break; - case MVT::i32: Opc = X86::CMOV_GR32; break; + case MVT::i8: Opc = X86::CMOV_GR8; break; + case MVT::i16: Opc = X86::CMOV_GR16; break; + case MVT::f16: Opc = X86::CMOV_FR16X; break; + case MVT::i32: Opc = X86::CMOV_GR32; break; case MVT::f32: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR32X : X86::CMOV_FR32; break; case MVT::f64: Opc = Subtarget->hasAVX512() ? X86::CMOV_FR64X diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1121,7 +1121,10 @@ if (VT.isVector() || VT == MVT::f128) break; - MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; + MVT VecVT = VT == MVT::f64 ? MVT::v2f64 + : VT == MVT::f32 ? MVT::v4f32 + : MVT::v8f16; + SDLoc dl(N); SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(0)); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -461,6 +461,7 @@ MOVHLPS, MOVSD, MOVSS, + MOVSH, UNPCKL, UNPCKH, VPERMILPV, @@ -999,7 +1000,8 @@ bool isCtlzFast() const override; bool hasBitPreservingFPLogic(EVT VT) const override { - return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); + return VT == MVT::f32 || VT == MVT::f64 || VT.isVector() || + (VT == MVT::f16 && X86ScalarSSEf16); } bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const override { @@ -1283,7 +1285,8 @@ /// register, not on the X87 floating point stack. 
bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 - (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 + (VT == MVT::f32 && X86ScalarSSEf32) || // f32 is when SSE1 + (VT == MVT::f16 && X86ScalarSSEf16); // f16 is when AVX512FP16 } /// Returns true if it is beneficial to convert a load of a constant @@ -1443,6 +1446,7 @@ /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf32; bool X86ScalarSSEf64; + bool X86ScalarSSEf16; /// A list of legal FP immediates. std::vector LegalFPImmediates; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -110,6 +110,7 @@ bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); X86ScalarSSEf64 = Subtarget.hasSSE2(); X86ScalarSSEf32 = Subtarget.hasSSE1(); + X86ScalarSSEf16 = Subtarget.hasFP16(); MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); // Set up the TargetLowering object. @@ -1897,6 +1898,71 @@ } } + if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { + auto setGroup = [&] (MVT VT) { + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + + setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + + setOperationAction(ISD::FNEG, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); + setOperationAction(ISD::FCOPYSIGN, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + }; + + // AVX512_FP16 scalar operations + setGroup(MVT::f16); + addRegisterClass(MVT::f16, &X86::FR16XRegClass); + + if (Subtarget.useAVX512Regs()) { + setGroup(MVT::v32f16); + addRegisterClass(MVT::v32f16, &X86::VR512RegClass); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); + + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); + } + + if (Subtarget.hasVLX()) { + addRegisterClass(MVT::v8f16, &X86::VR128XRegClass); + addRegisterClass(MVT::v16f16, &X86::VR256XRegClass); + setGroup(MVT::v8f16); + setGroup(MVT::v16f16); + + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); + + // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); + + setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); + + // Need to custom widen these to prevent scalarization. 
+ setOperationAction(ISD::LOAD, MVT::v4f16, Custom); + setOperationAction(ISD::STORE, MVT::v4f16, Custom); + } + + // Support fp16 0 immediate + addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); @@ -2160,6 +2226,11 @@ return RegisterVT; } + // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. + // So its default register type is f16. We override the type to v8f16 here. + if (VT == MVT::v3f16 && Subtarget.hasFP16()) + return MVT::v8f16; + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -2178,6 +2249,11 @@ return NumRegisters; } + // v3f16 will be widen to v4f16. But we don't assign register class for v4f16. + // So its default register number is 3. We override the number to 1 here. + if (VT == MVT::v3f16 && Subtarget.hasFP16()) + return 1; + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -3665,6 +3741,8 @@ RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; + else if (RegVT == MVT::f16) + RC = &X86::FR16XRegClass; else if (RegVT == MVT::f32) RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; else if (RegVT == MVT::f64) @@ -4867,6 +4945,7 @@ case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VBROADCAST: @@ -7153,6 +7232,7 @@ break; case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); @@ -8690,7 +8770,8 @@ // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs. if (IsConsecutiveLoad && FirstLoadedElt == 0 && - (LoadSizeInBits == 32 || LoadSizeInBits == 64) && + ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 || + LoadSizeInBits == 64) && ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) : MVT::getIntegerVT(LoadSizeInBits); @@ -9014,6 +9095,7 @@ // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || + (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) || (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast(Ld)) @@ -9076,6 +9158,9 @@ return BCast; } + if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); + // Unsupported broadcast. return SDValue(); } @@ -10476,13 +10561,15 @@ if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - (EltVT == MVT::i64 && Subtarget.is64Bit())) { + if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 || + EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) || + (EltVT == MVT::i16 && Subtarget.hasFP16())) { assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); - // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 
+ // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a + // zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } @@ -10612,7 +10699,7 @@ DAG, Subtarget)) return V; - if (EVTBits == 16 && NumElems == 8) + if (EltVT == MVT::i16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero, DAG, Subtarget)) return V; @@ -10669,7 +10756,7 @@ return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. - if (Subtarget.hasSSE41()) { + if (Subtarget.hasSSE41() && EltVT != MVT::f16) { SDValue Result; if (!Op.getOperand(0).isUndef()) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -13467,7 +13554,7 @@ if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); - if (EltVT == MVT::i8 || EltVT == MVT::i16) { + if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) @@ -13499,11 +13586,17 @@ if (!VT.is128BitVector()) return SDValue(); - // Otherwise, use MOVSD or MOVSS. - assert((EltVT == MVT::f32 || EltVT == MVT::f64) && - "Only two types of floating point element types to handle!"); - return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, - ExtVT, V1, V2); + // Otherwise, use MOVSD, MOVSS or MOVSH. + unsigned MovOpc = 0; + if (EltVT == MVT::f16) + MovOpc = X86ISD::MOVSH; + else if (EltVT == MVT::f32) + MovOpc = X86ISD::MOVSS; + else if (EltVT == MVT::f64) + MovOpc = X86ISD::MOVSD; + else + llvm_unreachable("Unsupported floating point element type to handle!"); + return DAG.getNode(MovOpc, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. @@ -15305,6 +15398,33 @@ Mask, Subtarget, DAG); } +/// Lower 8-lane 16-bit floating point shuffles. +static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef Mask, + const APInt &Zeroable, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); + assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!"); + assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); + int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; }); + + if (NumV2Elements == 0) { + // Check for being able to broadcast a single element. + if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2, + Mask, Subtarget, DAG)) + return Broadcast; + } + if (NumV2Elements == 1 && Mask[0] >= 8) + if (SDValue V = lowerShuffleAsElementInsertion(DL, MVT::v8f16, V1, V2, Mask, + Zeroable, Subtarget, DAG)) + return V; + + V1 = DAG.getBitcast(MVT::v8i16, V1); + V2 = DAG.getBitcast(MVT::v8i16, V2); + return DAG.getBitcast(MVT::v8f16, + DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask)); +} + // Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, // sub-512-bit shuffles are padded to 512-bits for the shuffle and then // the active subvector is extracted. 
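/* A usage sketch for the FP16 permutes exercised in the builtins tests above.
 * As their CHECK lines show, they are lowered by bitcasting to 16-bit integer
 * vectors and reusing the existing VPERMW/VPERMI2W patterns, so no
 * FP16-specific permute lowering is needed.  The index vector here simply
 * reverses the 32 lanes. */
#include <immintrin.h>

__m512h reverse_ph(__m512h v) {
  __m512i idx = _mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7,
                                 8, 9, 10, 11, 12, 13, 14, 15,
                                 16, 17, 18, 19, 20, 21, 22, 23,
                                 24, 25, 26, 27, 28, 29, 30, 31);
  return _mm512_permutexvar_ph(idx, v);
}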
@@ -15710,6 +15830,8 @@ return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); + case MVT::v8f16: + return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -17574,6 +17696,13 @@ return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); } + if (VT == MVT::v16f16) { + V1 = DAG.getBitcast(MVT::v16i16, V1); + V2 = DAG.getBitcast(MVT::v16i16, V2); + return DAG.getBitcast(MVT::v16f16, + DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask)); + } + switch (VT.SimpleTy) { case MVT::v4f64: return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); @@ -18140,6 +18269,13 @@ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG); } + if (VT == MVT::v32f16) { + V1 = DAG.getBitcast(MVT::v32i16, V1); + V2 = DAG.getBitcast(MVT::v32i16, V2); + return DAG.getBitcast(MVT::v32f16, + DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); + } + // Dispatch to each element type for lowering. If we don't have support for // specific element type shuffles at 512 bits, immediately split them and // lower them. Each lowering routine of a given type is allowed to assume that @@ -18845,14 +18981,18 @@ MVT VT = Op.getSimpleValueType(); - if (VT.getSizeInBits() == 16) { + if (VT == MVT::i16) { // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless // we're going to zero extend the register or fold the store (SSE41 only). if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) && - !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) + !(Subtarget.hasSSE41() && MayFoldIntoStore(Op))) { + if (Subtarget.hasFP16()) + return Op; + return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, DAG.getBitcast(MVT::v4i32, Vec), Idx)); + } SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, DAG.getTargetConstant(IdxVal, dl, MVT::i8)); @@ -18891,12 +19031,13 @@ return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); } - if (VT.getSizeInBits() == 32) { + if (VT == MVT::f16 || VT.getSizeInBits() == 32) { if (IdxVal == 0) return Op; - // SHUFPS the element to the lowest double word, then movss. - int Mask[4] = { static_cast(IdxVal), -1, -1, -1 }; + // Shuffle the element to the lowest element, then movss or movsh. + SmallVector Mask(VecVT.getVectorNumElements(), -1); + Mask[0] = static_cast(IdxVal); Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, DAG.getIntPtrConstant(0, dl)); @@ -19046,10 +19187,10 @@ } assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); - // This will be just movd/movq/movss/movsd. + // This will be just movw/movd/movq/movsh/movss/movsd. if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) { if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || - EltVT == MVT::i64) { + EltVT == MVT::f16 || EltVT == MVT::i64) { N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); } @@ -19148,8 +19289,9 @@ assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 && "Expected an SSE type!"); - // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen. - if (OpVT == MVT::v4i32) + // Pass through a v4i32 or V8i16 SCALAR_TO_VECTOR as that's what we use in + // tblgen. 
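/* A brief sketch of the scalar insert behaviour handled above: _mm_set_sh
 * places a value in lane 0 and zeroes the upper lanes (the zero-vector MOVL
 * case), and _mm_move_sh merges lane 0 of its second operand into its first,
 * matching the new X86ISD::MOVSH node. */
#include <immintrin.h>

__m128h replace_lane0(__m128h v, _Float16 h) {
  __m128h lane0 = _mm_set_sh(h); /* { h, 0, 0, 0, 0, 0, 0, 0 } */
  return _mm_move_sh(v, lane0);  /* v with lane 0 replaced by h */
}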
+ if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16())) return Op; SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); @@ -22031,9 +22173,8 @@ MVT VT = Op.getSimpleValueType(); bool IsF128 = (VT == MVT::f128); - assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || - VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && + assert(VT.isFloatingPoint() && VT != MVT::f80 && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFABSorFNEG"); // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to @@ -22047,7 +22188,9 @@ bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) - LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + LogicVT = (VT == MVT::f64) ? MVT::v2f64 + : (VT == MVT::f32) ? MVT::v4f32 + : MVT::v8f16; unsigned EltBits = VT.getScalarSizeInBits(); // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... @@ -22092,9 +22235,8 @@ // At this point the operands and the result should have the same // type, and that won't be f80 since that is not custom lowered. bool IsF128 = (VT == MVT::f128); - assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 || - VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 || - VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) && + assert(VT.isFloatingPoint() && VT != MVT::f80 && + DAG.getTargetLoweringInfo().isTypeLegal(VT) && "Unexpected type in LowerFCOPYSIGN"); const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); @@ -22107,7 +22249,9 @@ bool IsFakeVector = !VT.isVector() && !IsF128; MVT LogicVT = VT; if (IsFakeVector) - LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32; + LogicVT = (VT == MVT::f64) ? MVT::v2f64 + : (VT == MVT::f32) ? MVT::v4f32 + : MVT::v8f16; // The mask constants are automatically splatted for vector types. unsigned EltSizeInBits = VT.getScalarSizeInBits(); @@ -23081,7 +23225,7 @@ if (isFP) { #ifndef NDEBUG MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); - assert(EltVT == MVT::f32 || EltVT == MVT::f64); + assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); #endif bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; @@ -23095,7 +23239,10 @@ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 && (!IsStrict || Subtarget.hasVLX() || Op0.getSimpleValueType().is512BitVector())) { - assert(VT.getVectorNumElements() <= 16); +#ifndef NDEBUG + unsigned Num = VT.getVectorNumElements(); + assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16)); +#endif Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; } else { Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; @@ -31046,8 +31193,9 @@ assert(!VT.isVector() && "Vectors should have been handled above!"); - if (Subtarget.hasDQI() && VT == MVT::i64 && - (SrcVT == MVT::f32 || SrcVT == MVT::f64)) { + if ((Subtarget.hasDQI() && VT == MVT::i64 && + (SrcVT == MVT::f32 || SrcVT == MVT::f64)) || + (Subtarget.hasFP16() && SrcVT == MVT::f16)) { assert(!Subtarget.is64Bit() && "i64 should be legal"); unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; // If we use a 128-bit result we might need to use a target specific node. @@ -31701,6 +31849,7 @@ NODE_NAME_CASE(MOVSLDUP) NODE_NAME_CASE(MOVSD) NODE_NAME_CASE(MOVSS) + NODE_NAME_CASE(MOVSH) NODE_NAME_CASE(UNPCKL) NODE_NAME_CASE(UNPCKH) NODE_NAME_CASE(VBROADCAST) @@ -32651,6 +32800,7 @@ // conditional jump around it. 
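/* A usage sketch for the FP16 absolute-value helpers tested above.  As their
 * CHECK lines indicate, they reduce to a bitwise AND that clears each lane's
 * sign bit, so no dedicated instruction is required. */
#include <immintrin.h>

__m256h magnitudes(__m256h v) {
  return _mm256_abs_ph(v);
}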
static bool isCMOVPseudo(MachineInstr &MI) { switch (MI.getOpcode()) { + case X86::CMOV_FR16X: case X86::CMOV_FR32: case X86::CMOV_FR32X: case X86::CMOV_FR64: @@ -35272,17 +35422,15 @@ unsigned NumMaskElts = Mask.size(); unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); - // Match against a VZEXT_MOVL vXi32 zero-extending instruction. - if (MaskEltSize == 32 && Mask[0] == 0) { - if (isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) { + // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction. + if (Mask[0] == 0 && + (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) { + if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) || + (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && + isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; - return true; - } - if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && - isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { - Shuffle = X86ISD::VZEXT_MOVL; - SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; + SrcVT = DstVT = + !Subtarget.hasSSE2() && MaskEltSize == 32 ? MVT::v4f32 : MaskVT; return true; } } @@ -35576,6 +35724,12 @@ SrcVT = DstVT = MVT::v4f32; return true; } + if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}) && + Subtarget.hasFP16()) { + Shuffle = X86ISD::MOVSH; + SrcVT = DstVT = MVT::v8f16; + return true; + } } // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle. @@ -37009,6 +37163,10 @@ if (!VT.isVector() || !VT.isSimple()) return SDValue(); // Bail if we hit a non-simple non-vector. + // FIXME: Just bail on f16 for now. + if (VT.getVectorElementType() == MVT::f16) + return SDValue(); + assert((RootSizeInBits % VT.getSizeInBits()) == 0 && "Can only combine shuffles upto size of the root op."); @@ -38135,6 +38293,7 @@ assert(Mask.size() == 4); break; case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::MOVSS: { SDValue N0 = N.getOperand(0); SDValue N1 = N.getOperand(1); @@ -38519,6 +38678,12 @@ if (VT.is512BitVector()) return SDValue(); + // Do not generate X86ISD::ADDSUB node for FP16's vector types even though + // the ADDSUB idiom has been successfully recognized. There are no known + // X86 targets with FP16 ADDSUB instructions! + if (VT.getVectorElementType() == MVT::f16) + return SDValue(); + return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); } @@ -40497,6 +40662,7 @@ // Check if we have a bitcast from another integer type as well. if (!((Subtarget.hasSSE1() && VT == MVT::f32) || (Subtarget.hasSSE2() && VT == MVT::f64) || + (Subtarget.hasFP16() && VT == MVT::f16) || (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() && TLI.isTypeLegal(VT)))) return SDValue(); @@ -41072,7 +41238,8 @@ /// Extracting a scalar FP value from vector element 0 is free, so extract each /// operand first, then perform the math as a scalar op. 
-static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) { +static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); SDValue Vec = ExtElt->getOperand(0); SDValue Index = ExtElt->getOperand(1); @@ -41100,7 +41267,8 @@ return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); } - if (VT != MVT::f32 && VT != MVT::f64) + if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 && + VT != MVT::f64) return SDValue(); // Vector FP selects don't fit the pattern of FP math ops (because the @@ -41414,7 +41582,7 @@ if (SDValue V = combineArithReduction(N, DAG, Subtarget)) return V; - if (SDValue V = scalarizeExtEltFP(N, DAG)) + if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget)) return V; // Attempt to extract a i1 element by using MOVMSK to extract the signbits @@ -44393,7 +44561,8 @@ SDValue CMP01 = CMP0->getOperand(1); EVT VT = CMP00.getValueType(); - if (VT == MVT::f32 || VT == MVT::f64) { + if (VT == MVT::f32 || VT == MVT::f64 || + (VT == MVT::f16 && Subtarget.hasFP16())) { bool ExpectingFlags = false; // Check for any users that want flags: for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); @@ -44640,9 +44809,9 @@ EVT N10Type = N10.getValueType(); // Ensure that both types are the same and are legal scalar fp types. - if (N00Type != N10Type || - !((Subtarget.hasSSE1() && N00Type == MVT::f32) || - (Subtarget.hasSSE2() && N00Type == MVT::f64))) + if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) || + (Subtarget.hasSSE2() && N00Type == MVT::f64) || + (Subtarget.hasFP16() && N00Type == MVT::f16))) return SDValue(); unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); @@ -51390,6 +51559,7 @@ case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::MOVSH: case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: @@ -52142,7 +52312,8 @@ /// Check if \p RC is a vector register class. /// I.e., FR* / VR* or one of their variant. static bool isFRClass(const TargetRegisterClass &RC) { - return RC.hasSuperClassEq(&X86::FR32XRegClass) || + return RC.hasSuperClassEq(&X86::FR16XRegClass) || + RC.hasSuperClassEq(&X86::FR32XRegClass) || RC.hasSuperClassEq(&X86::FR64XRegClass) || RC.hasSuperClassEq(&X86::VR128XRegClass) || RC.hasSuperClassEq(&X86::VR256XRegClass) || diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -44,8 +44,9 @@ // It is a little bit complex for scalar types, where NumElts = 1. // In this case we build v4f32 or v2f64 string VTName = "v" # !if (!eq (NumElts, 1), + !if (!eq (EltVT.Size, 16), 8, !if (!eq (EltVT.Size, 32), 4, - !if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts) # EltVT; + !if (!eq (EltVT.Size, 64), 2, NumElts))), NumElts) # EltVT; // The vector VT. ValueType VT = !cast(VTName); @@ -65,8 +66,9 @@ X86MemOperand MemOp = !cast(TypeVariantName # Size # "mem"); X86MemOperand ScalarMemOp = !cast(EltVT # "mem"); // FP scalar memory operand for intrinsics - ssmem/sdmem. 
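  // AVX512-FP16 adds shmem (defined alongside ssmem/sdmem in
  // X86InstrFragmentsSIMD.td) as the f16 counterpart.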
- Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast("ssmem"), - !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?)); + Operand IntScalarMemOp = !if (!eq (EltTypeName, "f16"), !cast("shmem"), + !if (!eq (EltTypeName, "f32"), !cast("ssmem"), + !if (!eq (EltTypeName, "f64"), !cast("sdmem"), ?))); // Load patterns PatFrag LdFrag = !cast("load" # VTName); @@ -76,11 +78,9 @@ PatFrag ScalarLdFrag = !cast("load" # EltVT); PatFrag BroadcastLdFrag = !cast("X86VBroadcastld" # EltSizeName); - PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"), - !cast("sse_load_f32"), - !if (!eq (EltTypeName, "f64"), - !cast("sse_load_f64"), - ?)); + PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f16"), !cast("sse_load_f16"), + !if (!eq (EltTypeName, "f32"), !cast("sse_load_f32"), + !if (!eq (EltTypeName, "f64"), !cast("sse_load_f64"), ?))); // The string to specify embedded broadcast in assembly. string BroadcastStr = "{1to" # NumElts # "}"; @@ -95,9 +95,12 @@ Domain ExeDomain = !if (!eq (EltTypeName, "f32"), SSEPackedSingle, !if (!eq (EltTypeName, "f64"), SSEPackedDouble, - SSEPackedInt)); + !if (!eq (EltTypeName, "f16"), SSEPackedSingle, // FIXME? + SSEPackedInt))); - RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, FR64X); + RegisterClass FRC = !if (!eq (EltTypeName, "f32"), FR32X, + !if (!eq (EltTypeName, "f16"), FR16X, + FR64X)); dag ImmAllZerosV = (VT immAllZerosV); @@ -109,6 +112,7 @@ def v32i16_info : X86VectorVTInfo<32, i16, VR512, "w">; def v16i32_info : X86VectorVTInfo<16, i32, VR512, "d">; def v8i64_info : X86VectorVTInfo<8, i64, VR512, "q">; +def v32f16_info : X86VectorVTInfo<32, f16, VR512, "ph">; def v16f32_info : X86VectorVTInfo<16, f32, VR512, "ps">; def v8f64_info : X86VectorVTInfo<8, f64, VR512, "pd">; @@ -117,6 +121,7 @@ def v16i16x_info : X86VectorVTInfo<16, i16, VR256X, "w">; def v8i32x_info : X86VectorVTInfo<8, i32, VR256X, "d">; def v4i64x_info : X86VectorVTInfo<4, i64, VR256X, "q">; +def v16f16x_info : X86VectorVTInfo<16, f16, VR256X, "ph">; def v8f32x_info : X86VectorVTInfo<8, f32, VR256X, "ps">; def v4f64x_info : X86VectorVTInfo<4, f64, VR256X, "pd">; @@ -124,6 +129,7 @@ def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">; def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">; def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">; +def v8f16x_info : X86VectorVTInfo<8, f16, VR128X, "ph">; def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">; def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">; @@ -131,6 +137,7 @@ // with the appropriate element type. This allows to use the same masking logic. def i32x_info : X86VectorVTInfo<1, i32, GR32, "si">; def i64x_info : X86VectorVTInfo<1, i64, GR64, "sq">; +def f16x_info : X86VectorVTInfo<1, f16, VR128X, "sh">; def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">; def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">; @@ -149,6 +156,8 @@ v4i32x_info>; def avx512vl_i64_info : AVX512VLVectorVTInfo; +def avx512vl_f16_info : AVX512VLVectorVTInfo; def avx512vl_f32_info : AVX512VLVectorVTInfo; def avx512vl_f64_info : AVX512VLVectorVTInfo; } +let Predicates = [HasFP16] in { +def : Pat<(v8f16 immAllZerosV), (AVX512_128_SET0)>; +def : Pat<(v16f16 immAllZerosV), (AVX512_256_SET0)>; +def : Pat<(v32f16 immAllZerosV), (AVX512_512_SET0)>; +} + // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. 
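// The f16 variant below (AVX512_FsFLD0SH) is expanded the same way; an f16
// zero occupies an XMM register just like the f32/f64 zeros.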
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, @@ -513,6 +528,12 @@ [(set VR128X:$dst, fp128imm0)]>; } +let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, + isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasFP16] in { + def AVX512_FsFLD0SH : I<0, Pseudo, (outs FR16X:$dst), (ins), "", + [(set FR16X:$dst, fp16imm0)]>; +} + //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // @@ -649,16 +670,22 @@ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v8f16x_info, v16f16x_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16, HasVLX]>; // Codegen pattern with the alternative types insert VEC128 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v8i16x_info, v32i16_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v16i8x_info, v64i8_info, vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v8f16x_info, v32f16_info, + vinsert128_insert, INSERT_get_vinsert128_imm, [HasFP16]>; // Codegen pattern with the alternative types insert VEC256 into VEC512 defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v16f16x_info, v32f16_info, + vinsert256_insert, INSERT_get_vinsert256_imm, [HasFP16]>; multiclass vinsert_for_mask_cast; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v16f16x_info, v8f16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16, HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v32f16_info, v8f16x_info, + vextract128_extract, EXTRACT_get_vextract128_imm, [HasFP16]>; // Codegen pattern with the alternative types extract VEC256 from VEC512 defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v32f16_info, v16f16x_info, + vextract256_extract, EXTRACT_get_vextract256_imm, [HasFP16]>; // A 128-bit extract from bits [255:128] of a 512-bit vector should use a @@ -1015,6 +1048,12 @@ (iPTR 1)))>; } +let Predicates = [HasFP16, HasVLX] in +def : Pat<(v8f16 (extract_subvector (v32f16 VR512:$src), (iPTR 8))), + (v8f16 (VEXTRACTF32x4Z256rr + (v16f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_ymm)), + (iPTR 1)))>; + // Additional patterns for handling a bitcast between the vselect and the // 
extract_subvector. @@ -1439,6 +1478,31 @@ Sched<[SchedWriteShuffle.YMM.Folded]>, AVX5128IBase, EVEX; } +let Predicates = [HasFP16] in { + def : Pat<(v32f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZrm addr:$src)>; + + def : Pat<(v32f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZrr VR128X:$src)>; + def : Pat<(v32f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZrr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; +} +let Predicates = [HasVLX, HasFP16] in { + def : Pat<(v8f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZ128rm addr:$src)>; + def : Pat<(v16f16 (X86VBroadcastld16 addr:$src)), + (VPBROADCASTWZ256rm addr:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZ128rr VR128X:$src)>; + def : Pat<(v16f16 (X86VBroadcast (v8f16 VR128X:$src))), + (VPBROADCASTWZ256rr VR128X:$src)>; + + def : Pat<(v8f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZ128rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; + def : Pat<(v16f16 (X86VBroadcast (f16 FR16X:$src))), + (VPBROADCASTWZ256rr (COPY_TO_REGCLASS FR16X:$src, VR128X))>; +} //===----------------------------------------------------------------------===// // AVX-512 BROADCAST SUBVECTORS @@ -1462,6 +1526,8 @@ (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v32f16 (X86SubVBroadcastld256 addr:$src)), + (VBROADCASTF64X4rm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld256 addr:$src)), (VBROADCASTI64X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld256 addr:$src)), @@ -1475,6 +1541,8 @@ (VBROADCASTF32X4rm addr:$src)>; def : Pat<(v16f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v32f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4rm addr:$src)>; def : Pat<(v8i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4rm addr:$src)>; def : Pat<(v16i32 (X86SubVBroadcastld128 addr:$src)), @@ -1532,6 +1600,8 @@ (VBROADCASTF32X4Z256rm addr:$src)>; def : Pat<(v8f32 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)), + (VBROADCASTF32X4Z256rm addr:$src)>; def : Pat<(v4i64 (X86SubVBroadcastld128 addr:$src)), (VBROADCASTI32X4Z256rm addr:$src)>; def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)), @@ -3766,6 +3836,110 @@ def : Pat<(store (v32i8 VR256X:$src), addr:$dst), (VMOVDQU64Z256mr addr:$dst, VR256X:$src)>; } +let Predicates = [HasFP16] in { + def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), (v32f16 VR512:$src0))), + (VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>; + def : Pat<(v32f16 (alignedloadv32f16 addr:$src)), + (VMOVAPSZrm addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (loadv32f16 addr:$src)), + (VMOVUPSZrm addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (vselect VK32WM:$mask, + (v32f16 (loadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 
(masked_load addr:$src, VK32WM:$mask, (v32f16 VR512:$src0))), + (VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, undef)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)), + (VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst), + (VMOVAPSZmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32f16 VR512:$src), addr:$dst), + (VMOVUPSZmr addr:$dst, VR512:$src)>; + def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask), + (VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>; +} +let Predicates = [HasFP16, HasVLX] in { + def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>; + def : Pat<(v16f16 (alignedloadv16f16 addr:$src)), + (VMOVAPSZ256rm addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (loadv16f16 addr:$src)), + (VMOVUPSZ256rm addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (vselect VK16WM:$mask, + (v16f16 (loadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, (v16f16 VR256X:$src0))), + (VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, undef)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)), + (VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst), + (VMOVAPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(store (v16f16 VR256X:$src), addr:$dst), + (VMOVUPSZ256mr addr:$dst, VR256X:$src)>; + def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask), + (VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>; + + def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>; + def : Pat<(v8f16 (alignedloadv8f16 addr:$src)), + (VMOVAPSZ128rm addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (loadv8f16 addr:$src)), + (VMOVUPSZ128rm addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (vselect VK8WM:$mask, + (v8f16 (loadv8f16 
addr:$src)), v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, (v8f16 VR128X:$src0))), + (VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, undef)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)), + (VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>; + + def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst), + (VMOVAPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(store (v8f16 VR128X:$src), addr:$dst), + (VMOVUPSZ128mr addr:$dst, VR128X:$src)>; + def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask), + (VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>; +} // Move Int Doubleword to Packed Double Int // @@ -3905,12 +4079,13 @@ (VMOV64toSDZrr (KMOVQrk VK64:$src))>; //===----------------------------------------------------------------------===// -// AVX-512 MOVSS, MOVSD +// AVX-512 MOVSH, MOVSS, MOVSD //===----------------------------------------------------------------------===// multiclass avx512_move_scalar { - let Predicates = [HasAVX512, OptForSize] in + X86VectorVTInfo _, + list prd = [HasAVX512, OptForSize]> { + let Predicates = prd in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3976,6 +4151,9 @@ defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>, VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>; +defm VMOVSHZ : avx512_move_scalar<"vmovsh", X86Movsh, X86vzload16, f16x_info, + [HasFP16, OptForSize]>, + VEX_LIG, T_MAP5XS, EVEX_CD8<16, CD8VT1>; multiclass avx512_move_scalar_lowering { @@ -4144,9 +4322,14 @@ addr:$srcAddr)>; } +defm : avx512_move_scalar_lowering<"VMOVSHZ", X86Movsh, fp16imm0, v8f16x_info>; defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>; defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>; +defm : avx512_store_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_store_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>; defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4154,6 +4337,13 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +defm : avx512_store_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4179,6 +4369,10 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; +defm : avx512_load_scalar_lowering<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32>; +defm : avx512_load_scalar_lowering_subreg<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (bitconvert (and GR32:$mask, (i32 1)))), GR32, sub_32bit>; defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 
1)))))), GR32>; defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, @@ -4186,6 +4380,13 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +defm : avx512_load_scalar_lowering_subreg2<"VMOVSHZ", avx512vl_f16_info, + (v32i1 (insert_subvector + (v32i1 immAllZerosV), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + (iPTR 0))), + (v8i1 (bitconvert (and GR8:$mask, (i8 1)))), + GR8, sub_8bit>; defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info, (v16i1 (insert_subvector (v16i1 immAllZerosV), @@ -4211,6 +4412,16 @@ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), (iPTR 0))), GR8, sub_8bit>; +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), (f16 FR16X:$src2))), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrk + (v8f16 (COPY_TO_REGCLASS FR16X:$src2, VR128X)), + VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + +def : Pat<(f16 (X86selects VK1WM:$mask, (f16 FR16X:$src1), fp16imm0)), + (COPY_TO_REGCLASS (v8f16 (VMOVSHZrrkz VK1WM:$mask, (v8f16 (IMPLICIT_DEF)), + (v8f16 (COPY_TO_REGCLASS FR16X:$src1, VR128X)))), FR16X)>; + def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (v4f32 (VMOVSSZrrk (v4f32 (COPY_TO_REGCLASS FR32X:$src2, VR128X)), @@ -4259,6 +4470,32 @@ (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>; let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { + let Predicates = [HasFP16] in { + def VMOVSHZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, T_MAP5XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrr">, + Sched<[SchedWriteFShuffle.XMM]>; + + let Constraints = "$src0 = $dst" in + def VMOVSHZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f16x_info.RC:$src0, f16x_info.KRCWM:$mask, + VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + []>, T_MAP5XS, EVEX_K, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrrk">, + Sched<[SchedWriteFShuffle.XMM]>; + + def VMOVSHZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f16x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), + "vmovsh\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + []>, EVEX_KZ, T_MAP5XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSHZrrkz">, + Sched<[SchedWriteFShuffle.XMM]>; + } def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2), "vmovss\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -4311,6 +4548,16 @@ Sched<[SchedWriteFShuffle.XMM]>; } +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (VMOVSHZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + (VMOVSHZrrk_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; +def : InstAlias<"vmovsh.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + (VMOVSHZrrkz_REV VR128X:$dst, VK1WM:$mask, + VR128X:$src1, VR128X:$src2), 0>; def : InstAlias<"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", (VMOVSSZrr_REV VR128X:$dst, VR128X:$src1, VR128X:$src2), 0>; def : InstAlias<"vmovss.s\t{$src2, $src1, $dst {${mask}}|"# @@ -4393,6 +4640,29 @@ def : Pat<(v8f64 (X86vzload64 addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; } +let Predicates = [HasFP16] in 
{ + def : Pat<(v8f16 (X86vzmovl (v8f16 VR128X:$src))), + (VMOVSHZrr (v8f16 (AVX512_128_SET0)), VR128X:$src)>; + + // FIXME we need better canonicalization in dag combine + def : Pat<(v16f16 (X86vzmovl (v16f16 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)), + (v8f16 (EXTRACT_SUBREG (v16f16 VR256X:$src), sub_xmm)))), sub_xmm)>; + def : Pat<(v32f16 (X86vzmovl (v32f16 VR512:$src))), + (SUBREG_TO_REG (i32 0), + (v8f16 (VMOVSHZrr (v8f16 (AVX512_128_SET0)), + (v8f16 (EXTRACT_SUBREG (v32f16 VR512:$src), sub_xmm)))), sub_xmm)>; + + def : Pat<(v8f16 (X86vzload16 addr:$src)), + (VMOVSHZrm addr:$src)>; + + def : Pat<(v16f16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>; + + def : Pat<(v32f16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (VMOVSHZrm addr:$src), sub_xmm)>; +} let ExeDomain = SSEPackedInt, SchedRW = [SchedWriteVecLogic.XMM] in { def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst), @@ -12200,3 +12470,96 @@ defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA, avx512vl_f32_info, avx512vl_i32_info, HasBF16>, T8XS, EVEX_CD8<32, CD8VF>; + +//===----------------------------------------------------------------------===// +// AVX512FP16 +//===----------------------------------------------------------------------===// + +let Predicates = [HasFP16] in { +// Move word ( r/m16) to Packed word +def VMOVW2SHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveFromGpr]>; +def VMOVWrm : AVX512<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i16mem:$src), + "vmovw\t{$src, $dst|$dst, $src}", + [(set VR128X:$dst, + (v8i16 (scalar_to_vector (loadi16 addr:$src))))]>, + T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFLoad]>; + +def : Pat<(f16 (bitconvert GR16:$src)), + (f16 (COPY_TO_REGCLASS + (VMOVW2SHrr + (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), + FR16X))>; +def : Pat<(v8i16 (scalar_to_vector (i16 GR16:$src))), + (VMOVW2SHrr (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit))>; +def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (and GR32:$src, 0xffff)))), + (VMOVW2SHrr GR32:$src)>; +// FIXME: We should really find a way to improve these patterns. +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (and GR32:$src, 0xffff))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; +def : Pat<(v16i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (and GR32:$src, 0xffff))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVW2SHrr GR32:$src), sub_xmm)>; + +def : Pat<(v8i16 (X86vzmovl (v8i16 (scalar_to_vector (i16 (trunc GR32:$src)))))), + (VMOVW2SHrr GR32:$src)>; + +// AVX 128-bit movw instruction write zeros in the high 128-bit part. +def : Pat<(v8i16 (X86vzload16 addr:$src)), + (VMOVWrm addr:$src)>; +def : Pat<(v16i16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>; + +// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext. 
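+// SUBREG_TO_REG here just asserts that the upper bits are already zero (the
+// EVEX vmovw load zeroes them), so no extra instruction is emitted for the
+// wider types.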
+def : Pat<(v32i16 (X86vzload16 addr:$src)), + (SUBREG_TO_REG (i32 0), (v8i16 (VMOVWrm addr:$src)), sub_xmm)>; + +def : Pat<(v4i32 (scalar_to_vector (i32 (extloadi16 addr:$src)))), + (VMOVWrm addr:$src)>; +def : Pat<(v4i32 (X86vzmovl (scalar_to_vector (i32 (zextloadi16 addr:$src))))), + (VMOVWrm addr:$src)>; +def : Pat<(v8i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (i32 (zextloadi16 addr:$src)))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>; +def : Pat<(v16i32 (X86vzmovl + (insert_subvector undef, + (v4i32 (scalar_to_vector + (i32 (zextloadi16 addr:$src)))), + (iPTR 0)))), + (SUBREG_TO_REG (i32 0), (VMOVWrm addr:$src), sub_xmm)>; + +// Move word from xmm register to r/m16 +def VMOVSH2Wrr : AVX512<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, Sched<[WriteVecMoveToGpr]>; +def VMOVWmr : AVX512<0x7E, MRMDestMem, (outs), + (ins i16mem:$dst, VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", + [(store (i16 (extractelt (v8i16 VR128X:$src), + (iPTR 0))), addr:$dst)]>, + T_MAP5PD, EVEX, EVEX_CD8<16, CD8VT1>, Sched<[WriteFStore]>; + +def : Pat<(i16 (bitconvert FR16X:$src)), + (i16 (EXTRACT_SUBREG + (VMOVSH2Wrr (COPY_TO_REGCLASS FR16X:$src, VR128X)), + sub_16bit))>; +def : Pat<(i16 (extractelt (v8i16 VR128X:$src), (iPTR 0))), + (i16 (EXTRACT_SUBREG (VMOVSH2Wrr VR128X:$src), sub_16bit))>; +} + +// Allow "vmovw" to use GR64 +let hasSideEffects = 0 in { + def VMOVW64toSHrr : AVX512<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveFromGpr]>; + def VMOVSHtoW64rr : AVX512<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), + "vmovw\t{$src, $dst|$dst, $src}", []>, T_MAP5PD, EVEX, VEX_W, Sched<[WriteVecMoveToGpr]>; +} diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -555,6 +555,7 @@ let Predicates = [HasMMX] in defm _VR64 : CMOVrr_PSEUDO; + defm _FR16X : CMOVrr_PSEUDO; let Predicates = [HasSSE1,NoAVX512] in defm _FR32 : CMOVrr_PSEUDO; let Predicates = [HasSSE2,NoAVX512] in @@ -612,6 +613,8 @@ (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v8i16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; + def : Pat<(v8f16 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), + (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4i32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), (CMOV_VR128X VR128X:$t, VR128X:$f, timm:$cond)>; def : Pat<(v4f32 (X86cmov VR128X:$t, VR128X:$f, timm:$cond, EFLAGS)), @@ -623,6 +626,8 @@ (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v16i16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; + def : Pat<(v16f16 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), + (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8i32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), (CMOV_VR256X VR256X:$t, VR256X:$f, timm:$cond)>; def : Pat<(v8f32 (X86cmov VR256X:$t, VR256X:$f, timm:$cond, EFLAGS)), @@ -635,6 +640,8 @@ (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v32i16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; +def : Pat<(v32f16 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), + (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : 
Pat<(v16i32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), (CMOV_VR512 VR512:$t, VR512:$f, timm:$cond)>; def : Pat<(v16f32 (X86cmov VR512:$t, VR512:$f, timm:$cond, EFLAGS)), diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -149,8 +149,8 @@ // disable to ANDPS. // Class specifying the opcode map. -class Map val> { - bits<3> Value = val; +class Map val> { + bits<4> Value = val; } def OB : Map<0>; def TB : Map<1>; @@ -160,6 +160,8 @@ def XOP9 : Map<5>; def XOPA : Map<6>; def ThreeDNow : Map<7>; +def T_MAP5 : Map<8>; +def T_MAP6 : Map<9>; // Class specifying the encoding class Encoding val> { @@ -204,6 +206,16 @@ class XOP9 { Map OpMap = XOP9; Prefix OpPrefix = PS; } class XOPA { Map OpMap = XOPA; Prefix OpPrefix = PS; } class ThreeDNow { Map OpMap = ThreeDNow; } +class T_MAP5 { Map OpMap = T_MAP5; } +class T_MAP5PS : T_MAP5 { Prefix OpPrefix = PS; } // none +class T_MAP5PD : T_MAP5 { Prefix OpPrefix = PD; } // 0x66 +class T_MAP5XS : T_MAP5 { Prefix OpPrefix = XS; } // 0xF3 +class T_MAP5XD : T_MAP5 { Prefix OpPrefix = XD; } // 0xF2 +class T_MAP6 { Map OpMap = T_MAP6; } +class T_MAP6PS : T_MAP6 { Prefix OpPrefix = PS; } +class T_MAP6PD : T_MAP6 { Prefix OpPrefix = PD; } +class T_MAP6XS : T_MAP6 { Prefix OpPrefix = XS; } +class T_MAP6XD : T_MAP6 { Prefix OpPrefix = XD; } class OBXS { Prefix OpPrefix = XS; } class PS : TB { Prefix OpPrefix = PS; } class PD : TB { Prefix OpPrefix = PD; } @@ -301,7 +313,7 @@ Prefix OpPrefix = NoPrfx; // Which prefix byte does this inst have? bits<3> OpPrefixBits = OpPrefix.Value; Map OpMap = OB; // Which opcode map does this inst have? - bits<3> OpMapBits = OpMap.Value; + bits<4> OpMapBits = OpMap.Value; bit hasREX_WPrefix = 0; // Does this inst require the REX.W prefix? FPFormat FPForm = NotFP; // What flavor of FP instruction is this? bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? @@ -360,28 +372,28 @@ let TSFlags{10-9} = AdSizeBits; // No need for 3rd bit, we don't need to distinguish NoPrfx from PS. let TSFlags{12-11} = OpPrefixBits{1-0}; - let TSFlags{15-13} = OpMapBits; - let TSFlags{16} = hasREX_WPrefix; - let TSFlags{20-17} = ImmT.Value; - let TSFlags{23-21} = FPForm.Value; - let TSFlags{24} = hasLockPrefix; - let TSFlags{25} = hasREPPrefix; - let TSFlags{27-26} = ExeDomain.Value; - let TSFlags{29-28} = OpEncBits; - let TSFlags{37-30} = Opcode; + let TSFlags{16-13} = OpMapBits; + let TSFlags{17} = hasREX_WPrefix; + let TSFlags{21-18} = ImmT.Value; + let TSFlags{24-22} = FPForm.Value; + let TSFlags{25} = hasLockPrefix; + let TSFlags{26} = hasREPPrefix; + let TSFlags{28-27} = ExeDomain.Value; + let TSFlags{30-29} = OpEncBits; + let TSFlags{38-31} = Opcode; // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0. - let TSFlags{38} = HasVEX_W; - let TSFlags{39} = hasVEX_4V; - let TSFlags{40} = hasVEX_L; - let TSFlags{41} = hasEVEX_K; - let TSFlags{42} = hasEVEX_Z; - let TSFlags{43} = hasEVEX_L2; - let TSFlags{44} = hasEVEX_B; + let TSFlags{39} = HasVEX_W; + let TSFlags{40} = hasVEX_4V; + let TSFlags{41} = hasVEX_L; + let TSFlags{42} = hasEVEX_K; + let TSFlags{43} = hasEVEX_Z; + let TSFlags{44} = hasEVEX_L2; + let TSFlags{45} = hasEVEX_B; // If we run out of TSFlags bits, it's possible to encode this in 3 bits. 
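  // OpMapBits is now 4 bits wide (to fit T_MAP5/T_MAP6), so every later
  // TSFlags field, including CD8_Scale below, moves up by one bit.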
- let TSFlags{51-45} = CD8_Scale; - let TSFlags{52} = hasEVEX_RC; - let TSFlags{53} = hasNoTrackPrefix; - let TSFlags{54} = ExplicitVEXPrefix; + let TSFlags{52-46} = CD8_Scale; + let TSFlags{53} = hasEVEX_RC; + let TSFlags{54} = hasNoTrackPrefix; + let TSFlags{55} = ExplicitVEXPrefix; } class PseudoI pattern> diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -417,6 +417,11 @@ SDTCisVT<1, v4f32>, SDTCisVT<2, v4f32>]>>; +def X86Movsh : SDNode<"X86ISD::MOVSH", + SDTypeProfile<1, 2, [SDTCisVT<0, v8f16>, + SDTCisVT<1, v8f16>, + SDTCisVT<2, v8f16>]>>; + def X86Movlhps : SDNode<"X86ISD::MOVLHPS", SDTypeProfile<1, 2, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4f32>, @@ -796,6 +801,7 @@ //===----------------------------------------------------------------------===// // 128-bit load pattern fragments +def loadv8f16 : PatFrag<(ops node:$ptr), (v8f16 (load node:$ptr))>; def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>; def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>; def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>; @@ -804,6 +810,7 @@ def loadv16i8 : PatFrag<(ops node:$ptr), (v16i8 (load node:$ptr))>; // 256-bit load pattern fragments +def loadv16f16 : PatFrag<(ops node:$ptr), (v16f16 (load node:$ptr))>; def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>; def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>; def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>; @@ -812,6 +819,7 @@ def loadv32i8 : PatFrag<(ops node:$ptr), (v32i8 (load node:$ptr))>; // 512-bit load pattern fragments +def loadv32f16 : PatFrag<(ops node:$ptr), (v32f16 (load node:$ptr))>; def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>; def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>; def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>; @@ -823,6 +831,10 @@ def extloadv2f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv4f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; def extloadv8f32 : PatFrag<(ops node:$ptr), (extloadvf32 node:$ptr)>; +def extloadv2f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv4f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv8f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; +def extloadv16f16 : PatFrag<(ops node:$ptr), (extloadvf16 node:$ptr)>; // Like 'store', but always requires vector size alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), @@ -839,6 +851,8 @@ // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 +def alignedloadv8f16 : PatFrag<(ops node:$ptr), + (v8f16 (alignedload node:$ptr))>; def alignedloadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (alignedload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), @@ -854,6 +868,8 @@ // 256-bit aligned load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 +def alignedloadv16f16 : PatFrag<(ops node:$ptr), + (v16f16 (alignedload node:$ptr))>; def alignedloadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (alignedload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), @@ -868,6 +884,8 @@ (v32i8 (alignedload node:$ptr))>; // 512-bit aligned load pattern fragments +def alignedloadv32f16 : PatFrag<(ops node:$ptr), + (v32f16 (alignedload node:$ptr))>; def alignedloadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (alignedload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), @@ -926,6 +944,11 @@ def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>; def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>; +def X86vzload16 : PatFrag<(ops node:$src), + (X86vzld node:$src), [{ + return cast(N)->getMemoryVT().getStoreSize() == 2; +}]>; + def X86vzload32 : PatFrag<(ops node:$src), (X86vzld node:$src), [{ return cast(N)->getMemoryVT().getStoreSize() == 4; @@ -976,6 +999,10 @@ // only load a single element. // FIXME: We should add more canolicalizing in DAGCombine. Particulary removing // the simple_load case. +def sse_load_f16 : PatFrags<(ops node:$ptr), + [(v8f16 (simple_load node:$ptr)), + (v8f16 (X86vzload16 node:$ptr)), + (v8f16 (scalar_to_vector (loadf16 node:$ptr)))]>; def sse_load_f32 : PatFrags<(ops node:$ptr), [(v4f32 (simple_load node:$ptr)), (v4f32 (X86vzload32 node:$ptr)), @@ -985,9 +1012,13 @@ (v2f64 (X86vzload64 node:$ptr)), (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>; +def shmem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; +def fp16imm0 : PatLeaf<(f16 fpimm), [{ + return N->isExactlyValue(+0.0); +}]>; def fp32imm0 : PatLeaf<(f32 fpimm), [{ return N->isExactlyValue(+0.0); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -699,6 +699,8 @@ return true; case X86::MOV16rm: case X86::KMOVWkm: + case X86::VMOVSHZrm: + case X86::VMOVSHZrm_alt: MemBytes = 2; return true; case X86::MOV32rm: @@ -795,6 +797,7 @@ return true; case X86::MOV16mr: case X86::KMOVWmk: + case X86::VMOVSHZmr: MemBytes = 2; return true; case X86::MOV32mr: @@ -980,6 +983,7 @@ case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: case X86::AVX512_FsFLD0SD: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0F128: case X86::AVX_SET0: @@ -1047,6 +1051,8 @@ case X86::VMOVSSZrm_alt: case X86::VMOVSDZrm: case X86::VMOVSDZrm_alt: + case X86::VMOVSHZrm: + case X86::VMOVSHZrm_alt: case X86::VMOVAPDZ128rm: case X86::VMOVAPDZ256rm: case X86::VMOVAPDZrm: @@ -3605,6 +3611,10 @@ case 2: if (X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; + if (X86::FR16XRegClass.hasSubClassEq(RC)) { + assert(STI.hasFP16()); + return load ? 
X86::VMOVSHZrm_alt : X86::VMOVSHZmr; + } assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: @@ -4755,6 +4765,7 @@ return true; } case X86::AVX512_128_SET0: + case X86::AVX512_FsFLD0SH: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0SD: case X86::AVX512_FsFLD0F128: { @@ -6101,6 +6112,9 @@ case X86::AVX512_FsFLD0SS: Alignment = Align(4); break; + case X86::AVX512_FsFLD0SH: + Alignment = Align(2); + break; default: return nullptr; } @@ -6136,6 +6150,7 @@ case X86::AVX512_256_SET0: case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: + case X86::AVX512_FsFLD0SH: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: case X86::FsFLD0SS: @@ -6174,6 +6189,8 @@ Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128) Ty = Type::getFP128Ty(MF.getFunction().getContext()); + else if (Opc == X86::AVX512_FsFLD0SH) + Ty = Type::getHalfTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 16); diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -421,6 +421,7 @@ def i128mem : X86MemOperand<"printxmmwordmem", X86Mem128AsmOperand>; def i256mem : X86MemOperand<"printymmwordmem", X86Mem256AsmOperand>; def i512mem : X86MemOperand<"printzmmwordmem", X86Mem512AsmOperand>; +def f16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>; def f32mem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>; def f64mem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>; def f80mem : X86MemOperand<"printtbytemem", X86Mem80AsmOperand>; @@ -919,6 +920,7 @@ def HasVNNI : Predicate<"Subtarget->hasVNNI()">; def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">; def HasBF16 : Predicate<"Subtarget->hasBF16()">; +def HasFP16 : Predicate<"Subtarget->hasFP16()">; def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">; def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">; @@ -1193,6 +1195,7 @@ }]>; def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr))>; +def loadf16 : PatFrag<(ops node:$ptr), (f16 (load node:$ptr))>; def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr))>; def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr))>; def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr))>; diff --git a/llvm/lib/Target/X86/X86InstrVecCompiler.td b/llvm/lib/Target/X86/X86InstrVecCompiler.td --- a/llvm/lib/Target/X86/X86InstrVecCompiler.td +++ b/llvm/lib/Target/X86/X86InstrVecCompiler.td @@ -25,6 +25,8 @@ let Predicates = [HasAVX512] in { // A vector extract of the first f32/f64 position is a subregister copy + def : Pat<(f16 (extractelt (v8f16 VR128X:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v8f16 VR128X:$src), FR16X)>; def : Pat<(f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X)>; def : Pat<(f64 (extractelt (v2f64 VR128X:$src), (iPTR 0))), @@ -32,6 +34,8 @@ } let Predicates = [NoVLX] in { + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128)>; // Implicitly promote a 32-bit scalar to a vector. 
def : Pat<(v4f32 (scalar_to_vector FR32:$src)), (COPY_TO_REGCLASS FR32:$src, VR128)>; @@ -41,6 +45,8 @@ } let Predicates = [HasVLX] in { + def : Pat<(v8f16 (scalar_to_vector FR16X:$src)), + (COPY_TO_REGCLASS FR16X:$src, VR128X)>; // Implicitly promote a 32-bit scalar to a vector. def : Pat<(v4f32 (scalar_to_vector FR32X:$src)), (COPY_TO_REGCLASS FR32X:$src, VR128X)>; @@ -74,6 +80,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -85,6 +92,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // A 128-bit subvector extract from the first 512-bit vector position is a // subregister copy that needs no instruction. Likewise, a 128-bit subvector @@ -96,6 +104,7 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +defm : subvector_subreg_lowering; // If we're inserting into an all zeros vector, just use a plain move which @@ -159,6 +168,12 @@ defm : subvec_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, sub_ymm>; } +let Predicates = [HasFP16, HasVLX] in { + defm : subvec_zero_lowering<"APSZ128", VR128X, v16f16, v8f16, v8i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ128", VR128X, v32f16, v8f16, v16i32, sub_xmm>; + defm : subvec_zero_lowering<"APSZ256", VR256X, v32f16, v16f16, v16i32, sub_ymm>; +} + class maskzeroupper : PatLeaf<(vt RC:$src), [{ return isMaskZeroExtended(N); diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -567,9 +567,9 @@ // Generic vector registers: VR64 and VR128. // Ensure that float types are declared first - only float is legal on SSE1. def VR64: RegisterClass<"X86", [x86mmx], 64, (sequence "MM%u", 0, 7)>; -def VR128 : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128 : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32)>; -def VR256 : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256 : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 15)>; // Status flags registers. @@ -587,7 +587,7 @@ } // AVX-512 vector/mask registers. -def VR512 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i64], +def VR512 : RegisterClass<"X86", [v16f32, v8f64, v32f16, v64i8, v32i16, v16i32, v8i64], 512, (sequence "ZMM%u", 0, 31)>; // Represents the lower 16 registers that have VEX/legacy encodable subregs. 
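// FR16X (defined below) and the new f16 vector types reuse the existing
// XMM/YMM/ZMM register file, so FP16 adds no new physical registers.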
@@ -599,10 +599,12 @@ def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; +def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)>; + // Extended VR128 and VR256 for AVX-512 instructions -def VR128X : RegisterClass<"X86", [v4f32, v2f64, v16i8, v8i16, v4i32, v2i64, f128], +def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v16i8, v8i16, v4i32, v2i64, f128], 128, (add FR32X)>; -def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], +def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; // Mask registers diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -87,8 +87,10 @@ } // Multiclass that wraps X86SchedWriteWidths for each fp vector type. -class X86SchedWriteSizes { + X86SchedWriteWidths PH = sPH; X86SchedWriteWidths PS = sPS; X86SchedWriteWidths PD = sPD; } @@ -681,20 +683,22 @@ WriteVarBlendY, WriteVarBlendZ>; // Vector size wrappers. +// FIXME: Currently PH uses the same schedule method as PS. +// We may refine them later. def SchedWriteFAddSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFCmpSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFMulSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFDivSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFSqrtSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFLogicSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; def SchedWriteFShuffleSizes - : X86SchedWriteSizes; + : X86SchedWriteSizes; //===----------------------------------------------------------------------===// // Generic Processor Scheduler Models. 
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -353,6 +353,9 @@ /// Processor has AVX-512 Vector Length eXtenstions bool HasVLX = false; + /// Processor has AVX-512 16 bit floating-point extenstions + bool HasFP16 = false; + /// Processor has PKU extenstions bool HasPKU = false; @@ -742,6 +745,7 @@ bool hasDQI() const { return HasDQI; } bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } + bool hasFP16() const { return HasFP16; } bool hasPKU() const { return HasPKU; } bool hasVNNI() const { return HasVNNI; } bool hasBF16() const { return HasBF16; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -1199,6 +1199,29 @@ LT.first = NumOfDests * NumOfShufflesPerDest; } + static const CostTblEntry AVX512FP16ShuffleTbl[] = { + {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw + {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw + + {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw + {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb + + {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw + {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb + + {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w + {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w + }; + + if (!ST->useSoftFloat() && ST->hasFP16()) + if (const auto *Entry = + CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX512VBMIShuffleTbl[] = { {TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb {TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb @@ -4693,6 +4716,9 @@ if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy()) return true; + if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16()) + return true; + if (!ScalarTy->isIntegerTy()) return false; @@ -5150,12 +5176,13 @@ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) { - auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { + auto isSupportedOnAVX512 = [&](Type *VecTy, bool HasBW) { Type *EltTy = cast(VecTy)->getElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; - if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || + (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy())) return HasBW; return false; }; diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-half.ll @@ -0,0 +1,140 @@ +; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mattr=avx512fp16 %s 2>&1 | FileCheck %s +target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +@src = common local_unnamed_addr global [120 x half] zeroinitializer, align 4 +@dst = common local_unnamed_addr global [120 x half] zeroinitializer, align 4 + +; Function Attrs: 
norecurse nounwind +define void @stride8(half %k, i32 %width_) { +entry: + +; CHECK: Found an estimated cost of 148 for VF 32 For instruction: %0 = load half + + %cmp72 = icmp sgt i32 %width_, 0 + br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ] + %arrayidx = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %i.073 + %0 = load half, half* %arrayidx, align 4 + %mul = fmul fast half %0, %k + %arrayidx2 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %i.073 + %1 = load half, half* %arrayidx2, align 4 + %add3 = fadd fast half %1, %mul + store half %add3, half* %arrayidx2, align 4 + %add4 = or i32 %i.073, 1 + %arrayidx5 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add4 + %2 = load half, half* %arrayidx5, align 4 + %mul6 = fmul fast half %2, %k + %arrayidx8 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add4 + %3 = load half, half* %arrayidx8, align 4 + %add9 = fadd fast half %3, %mul6 + store half %add9, half* %arrayidx8, align 4 + %add10 = or i32 %i.073, 2 + %arrayidx11 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add10 + %4 = load half, half* %arrayidx11, align 4 + %mul12 = fmul fast half %4, %k + %arrayidx14 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add10 + %5 = load half, half* %arrayidx14, align 4 + %add15 = fadd fast half %5, %mul12 + store half %add15, half* %arrayidx14, align 4 + %add16 = or i32 %i.073, 3 + %arrayidx17 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add16 + %6 = load half, half* %arrayidx17, align 4 + %mul18 = fmul fast half %6, %k + %arrayidx20 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add16 + %7 = load half, half* %arrayidx20, align 4 + %add21 = fadd fast half %7, %mul18 + store half %add21, half* %arrayidx20, align 4 + %add22 = or i32 %i.073, 4 + %arrayidx23 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add22 + %8 = load half, half* %arrayidx23, align 4 + %mul24 = fmul fast half %8, %k + %arrayidx26 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add22 + %9 = load half, half* %arrayidx26, align 4 + %add27 = fadd fast half %9, %mul24 + store half %add27, half* %arrayidx26, align 4 + %add28 = or i32 %i.073, 5 + %arrayidx29 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add28 + %10 = load half, half* %arrayidx29, align 4 + %mul30 = fmul fast half %10, %k + %arrayidx32 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add28 + %11 = load half, half* %arrayidx32, align 4 + %add33 = fadd fast half %11, %mul30 + store half %add33, half* %arrayidx32, align 4 + %add34 = or i32 %i.073, 6 + %arrayidx35 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add34 + %12 = load half, half* %arrayidx35, align 4 + %mul36 = fmul fast half %12, %k + %arrayidx38 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add34 + %13 = load half, half* %arrayidx38, align 4 + %add39 = fadd fast half %13, %mul36 + store half %add39, half* %arrayidx38, align 4 + %add40 = or i32 %i.073, 7 + %arrayidx41 = getelementptr inbounds [120 x half], [120 x 
half]* @src, i32 0, i32 %add40 + %14 = load half, half* %arrayidx41, align 4 + %mul42 = fmul fast half %14, %k + %arrayidx44 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add40 + %15 = load half, half* %arrayidx44, align 4 + %add45 = fadd fast half %15, %mul42 + store half %add45, half* %arrayidx44, align 4 + %add46 = add nuw nsw i32 %i.073, 8 + %cmp = icmp slt i32 %add46, %width_ + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +; Function Attrs: norecurse nounwind +define void @stride3(half %k, i32 %width_) { +entry: + +; CHECK: Found an estimated cost of 18 for VF 32 For instruction: %0 = load half + + %cmp27 = icmp sgt i32 %width_, 0 + br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body.lr.ph, %for.body + %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ] + %arrayidx = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %i.028 + %0 = load half, half* %arrayidx, align 4 + %mul = fmul fast half %0, %k + %arrayidx2 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %i.028 + %1 = load half, half* %arrayidx2, align 4 + %add3 = fadd fast half %1, %mul + store half %add3, half* %arrayidx2, align 4 + %add4 = add nuw nsw i32 %i.028, 1 + %arrayidx5 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add4 + %2 = load half, half* %arrayidx5, align 4 + %mul6 = fmul fast half %2, %k + %arrayidx8 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add4 + %3 = load half, half* %arrayidx8, align 4 + %add9 = fadd fast half %3, %mul6 + store half %add9, half* %arrayidx8, align 4 + %add10 = add nuw nsw i32 %i.028, 2 + %arrayidx11 = getelementptr inbounds [120 x half], [120 x half]* @src, i32 0, i32 %add10 + %4 = load half, half* %arrayidx11, align 4 + %mul12 = fmul fast half %4, %k + %arrayidx14 = getelementptr inbounds [120 x half], [120 x half]* @dst, i32 0, i32 %add10 + %5 = load half, half* %arrayidx14, align 4 + %add15 = fadd fast half %5, %mul12 + store half %add15, half* %arrayidx14, align 4 + %add16 = add nuw nsw i32 %i.028, 3 + %cmp = icmp slt i32 %add16, %width_ + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/shuffle-broadcast-fp16.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s + +define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) { +; CHECK-LABEL: 'test_vXf16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x 
i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> zeroinitializer + %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> zeroinitializer + %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> zeroinitializer + %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> zeroinitializer + %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> zeroinitializer + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/shuffle-reverse-fp16.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s + +define void @test_vXf16(<2 x half> %src32, <4 x half> %src64, <8 x half> %src128, <16 x half> %src256, <32 x half> %src512) { +; CHECK-LABEL: 'test_vXf16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %V32 = shufflevector <2 x half> %src32, <2 x half> undef, <2 x i32> + %V64 = shufflevector <4 x half> %src64, <4 x half> undef, <4 x i32> + %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> + %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> + %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/shuffle-single-src-fp16.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s + +define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024) { +; CHECK-LABEL: 'test_vXf16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %V128 = shufflevector <8 x half> %src128, <8 x half> undef, <8 x i32> + %V256 = shufflevector <16 x half> %src256, <16 x half> undef, <16 x i32> + %V512 = shufflevector <32 x half> %src512, <32 x half> undef, <32 x i32> + %V1024 = shufflevector <64 x half> %src1024, <64 x half> undef, <64 x i32> + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Analysis/CostModel/X86/shuffle-two-src-fp16.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512fp16 | FileCheck %s + +define void @test_vXf16(<8 x half> %src128, <16 x half> %src256, <32 x half> %src512, <64 x half> %src1024, <8 x half> %src128_1, <16 x half> %src256_1, <32 x half> %src512_1, <64 x half> %src1024_1) { +; CHECK-LABEL: 'test_vXf16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V512 = shufflevector <32 x half> %src512, <32 x half> %src512_1, <32 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V1024 = shufflevector <64 x half> %src1024, <64 x half> %src1024_1, <64 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + %V128 = shufflevector <8 x half> %src128, <8 x half> %src128_1, <8 x i32> + %V256 = shufflevector <16 x half> %src256, <16 x half> %src256_1, <16 x i32> + %V512 = shufflevector <32 x half> %src512, <32 x half> %src512_1, <32 x i32> + %V1024 = shufflevector <64 x half> %src1024, <64 x half> %src1024_1, <64 x i32> + ret void +} diff --git a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir --- a/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir +++ b/llvm/test/CodeGen/MIR/X86/inline-asm-registers.mir @@ -28,8 +28,8 @@ liveins: $rdi, $rsi ; CHECK-LABEL: name: test - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi, - INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, + INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi, 2147483657, killed $rsi, 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RETQ killed $rax ... @@ -45,8 +45,8 @@ ; Verify that the register ties are preserved. 
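+    ; Note: the regdef immediates below change from 4325386 to 4390922 most likely because
+    ; the GR64 register class ID shifts once the new FP16 register classes are added.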
; CHECK-LABEL: name: test2 - ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4325386 /* regdef:GR64 */, def $rsi, 4325386 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags - INLINEASM &foo, 0, 4325386, def $rsi, 4325386, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &foo, 0 /* attdialect */, 4390922 /* regdef:GR64 */, def $rsi, 4390922 /* regdef:GR64 */, def dead $rdi, 2147549193 /* reguse tiedto:$1 */, killed $rdi(tied-def 5), 2147483657 /* reguse tiedto:$0 */, killed $rsi(tied-def 3), 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &foo, 0, 4390922, def $rsi, 4390922, def dead $rdi, 2147549193, killed $rdi(tied-def 5), 2147483657, killed $rsi(tied-def 3), 12, implicit-def dead early-clobber $eflags $rax = MOV64rr killed $rsi RETQ killed $rax ... diff --git a/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll b/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-insert-extract.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16,+avx512vl | FileCheck %s --check-prefixes=CHECK + +define <8 x half> @extract_v16f16_v8f16_0(<16 x half> %x) { +; CHECK-LABEL: extract_v16f16_v8f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v16f16_v8f16_1(<16 x half> %x) { +; CHECK-LABEL: extract_v16f16_v8f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_0(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_1(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_2(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <8 x half> @extract_v32f16_v8f16_3(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v8f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <8 x i32> + ret <8 x half> %a +} + +define <16 x half> @extract_v32f16_v81616_0(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v81616_0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, 
<32 x half> undef, <16 x i32> + ret <16 x half> %a +} + +define <16 x half> @extract_v32f16_v81616_1(<32 x half> %x) { +; CHECK-LABEL: extract_v32f16_v81616_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <32 x half> %x, <32 x half> undef, <16 x i32> + ret <16 x half> %a +} + +define <16 x half> @concat_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: concat_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %x, <8 x half> %y, <16 x i32> + ret <16 x half> %a +} + +define <32 x half> @concat_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: concat_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> %y, <32 x i32> + ret <32 x half> %a +} + +define <16 x half> @concat_zero_v8f16(<8 x half> %x, <8 x half> %y) { +; CHECK-LABEL: concat_zero_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm0, %xmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %x, <8 x half> zeroinitializer, <16 x i32> + ret <16 x half> %a +} + +define <32 x half> @concat_zero_v16f16(<16 x half> %x, <16 x half> %y) { +; CHECK-LABEL: concat_zero_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %ymm0, %ymm0 +; CHECK-NEXT: retq + %a = shufflevector <16 x half> %x, <16 x half> zeroinitializer, <32 x i32> + ret <32 x half> %a +} + +define <32 x half> @insert_v8f16_v32f16_0(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_1(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_2(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} + +define <32 x half> @insert_v8f16_v32f16_3(<32 x half> %x, <8 x half> %y) { +; CHECK-LABEL: insert_v8f16_v32f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq + %a = shufflevector <8 x half> %y, <8 x half> undef, <32 x i32> + %b = shufflevector <32 x half> %x, <32 x half> %a, <32 x i32> + ret <32 x half> %b +} diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -0,0 +1,1887 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512fp16 | FileCheck %s --check-prefixes=CHECK,X86 + +define <8 x half> @broadcastph128(half* 
%x) { +; X64-LABEL: broadcastph128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %xmm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <8 x half> undef, half %l1, i32 0 + %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256(half* %x) { +; X64-LABEL: broadcastph256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %ymm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <16 x half> undef, half %l1, i32 0 + %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512(half* %x) { +; X64-LABEL: broadcastph512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %zmm0 +; X86-NEXT: retl + %l1 = load half, half* %x, align 2 + %vec = insertelement <32 x half> undef, half %l1, i32 0 + %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define <8 x half> @broadcastph128_scalar(half %x) { +; X64-LABEL: broadcastph128_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph128_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %vec = insertelement <8 x half> undef, half %x, i32 0 + %res = shufflevector <8 x half> %vec, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256_scalar(half %x) { +; X64-LABEL: broadcastph256_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph256_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: retl + %vec = insertelement <16 x half> undef, half %x, i32 0 + %res = shufflevector <16 x half> %vec, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512_scalar(half %x) { +; X64-LABEL: broadcastph512_scalar: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %xmm0, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: broadcastph512_scalar: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: retl + %vec = insertelement <32 x half> undef, half %x, i32 0 + %res = shufflevector <32 x half> %vec, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define <8 x half> @broadcastph128_reg(<8 x half> %x) { +; CHECK-LABEL: broadcastph128_reg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <8 x half> %x, <8 x half> undef, <8 x i32> zeroinitializer + ret <8 x half> %res +} + +define <16 x half> @broadcastph256_reg(<16 x half> %x) { +; CHECK-LABEL: broadcastph256_reg: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <16 x half> %x, <16 x half> undef, <16 x i32> zeroinitializer + ret <16 x half> %res +} + +define <32 x half> @broadcastph512_reg(<32 x half> %x) { +; CHECK-LABEL: broadcastph512_reg: +; 
CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <32 x half> %x, <32 x half> undef, <32 x i32> zeroinitializer + ret <32 x half> %res +} + +define i16 @test1(half %x) { +; X64-LABEL: test1: +; X64: # %bb.0: +; X64-NEXT: vmovw %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test1: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %res = bitcast half %x to i16 + ret i16 %res +} + +define <8 x i16> @test2(i16 %x) { +; X64-LABEL: test2: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test2: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16>undef, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <8 x i16> @test4(i16* %x) { +; X64-LABEL: test4: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpbroadcastw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x + %res = insertelement <8 x i16>undef, i16 %y, i32 0 + ret <8 x i16>%res +} + +define void @test5(half %x, half* %y) { +; X64-LABEL: test5: +; X64: # %bb.0: +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: test5: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + store half %x, half* %y, align 2 + ret void +} + +define half @test7(i16* %x) { +; X64-LABEL: test7: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x + %res = bitcast i16 %y to half + ret half %res +} + +define <8 x i16> @test10(i16* %x) { +; X64-LABEL: test10: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <8 x i16>zeroinitializer, i16 %y, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test10b(i16* %x) { +; X64-LABEL: test10b: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10b: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <16 x i16>zeroinitializer, i16 %y, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test10c(i16* %x) { +; X64-LABEL: test10c: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test10c: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: retl + %y = load i16, i16* %x, align 2 + %res = insertelement <32 x i16>zeroinitializer, i16 %y, i32 0 + ret <32 x i16>%res +} + +define <8 x half> @test11(half* %x) { +; X64-LABEL: test11: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <8 x half>zeroinitializer, half %y, i32 0 + ret <8 x half>%res +} + +define <16 x half> @test11b(half* %x) { +; X64-LABEL: test11b: +; X64: # %bb.0: +; X64-NEXT: vmovsh 
(%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11b: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <16 x half>zeroinitializer, half %y, i32 0 + ret <16 x half>%res +} + +define <32 x half> @test11c(half* %x) { +; X64-LABEL: test11c: +; X64: # %bb.0: +; X64-NEXT: vmovsh (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test11c: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh (%eax), %xmm0 +; X86-NEXT: retl + %y = load half, half* %x, align 2 + %res = insertelement <32 x half>zeroinitializer, half %y, i32 0 + ret <32 x half>%res +} + +define <8 x half> @test14(half %x) { +; X64-LABEL: test14: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x half>zeroinitializer, half %x, i32 0 + ret <8 x half>%res +} + +define <16 x half> @test14b(half %x) { +; X64-LABEL: test14b: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14b: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <16 x half>zeroinitializer, half %x, i32 0 + ret <16 x half>%res +} + +define <32 x half> @test14c(half %x) { +; X64-LABEL: test14c: +; X64: # %bb.0: +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vmovsh %xmm0, %xmm1, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test14c: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <32 x half>zeroinitializer, half %x, i32 0 + ret <32 x half>%res +} + +define <8 x i16> @test15(i16 %x) { +; X64-LABEL: test15: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test15: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16>zeroinitializer, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test16(i16 %x) { +; X64-LABEL: test16: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test16: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <16 x i16>zeroinitializer, i16 %x, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test17(i16 %x) { +; X64-LABEL: test17: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test17: +; X86: # %bb.0: +; X86-NEXT: vmovw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <32 x i16>zeroinitializer, i16 %x, i32 0 + ret <32 x i16>%res +} + +define <8 x i16> @test18(i16 %x) { +; X64-LABEL: test18: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test18: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = insertelement <8 x i16> undef, i16 %x, i32 0 + ret <8 x i16>%res +} + +define <16 x i16> @test19(i16 %x) { +; X64-LABEL: test19: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test19: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 +; X86-NEXT: retl + %res = insertelement <16 x i16> undef, i16 %x, i32 0 + ret <16 x i16>%res +} + +define <32 x i16> @test20(i16 %x) { +; X64-LABEL: test20: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test20: +; X86: 
# %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 +; X86-NEXT: retl + %res = insertelement <32 x i16> undef, i16 %x, i32 0 + ret <32 x i16>%res +} + +@g8f16 = external global <8 x half> +@g8f16u = external global <8 x half>, align 8 +@g16f16 = external global <16 x half> +@g16f16u = external global <16 x half>, align 8 +@g32f16 = external global <32 x half> +@g32f16u = external global <32 x half>, align 8 + +define <32 x half> @load32f16(<32 x half>* %a) { +; X64-LABEL: load32f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: load32f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %zmm0 +; X86-NEXT: retl + %res = load <32 x half>, <32 x half>* %a + ret <32 x half> %res +} + +define <32 x half> @load32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { +; X64-LABEL: load32f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load32f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @load32f16maskz(<32 x half>* %a, i32 %c) { +; X64-LABEL: load32f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load32f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <32 x half> @loadu32f16(<32 x half>* %a) { +; X64-LABEL: loadu32f16: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %zmm0 +; X86-NEXT: retl + %res = load <32 x half>, <32 x half>* %a, align 8 + ret <32 x half> %res +} + +define <32 x half> @loadu32f16mask(<32 x half>* %a, <32 x half> %b, i32 %c) { +; X64-LABEL: loadu32f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a, align 8 + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @loadu32f16maskz(<32 x half>* %a, i32 %c) { +; X64-LABEL: loadu32f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu32f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i32 %c to <32 x i1> + %res0 = load <32 x half>, <32 x half>* %a, align 8 + %res = select <32 x i1> %msk, <32 x half> %res0, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define void @store32f16(<32 x half> %a) { +; 
X64-LABEL: store32f16: +; X64: # %bb.0: +; X64-NEXT: movq g32f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %zmm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: store32f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %zmm0, g32f16 +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <32 x half> %a, <32 x half>* @g32f16 + ret void +} + +define void @storeu32f16(<32 x half> %a) { +; X64-LABEL: storeu32f16: +; X64: # %bb.0: +; X64-NEXT: movq g32f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %zmm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu32f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %zmm0, g32f16u +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <32 x half> %a, <32 x half>* @g32f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v32f16.p0v32f16(<32 x half>, <32 x half>*, i32, <32 x i1>) +declare <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>*, i32, <32 x i1>, <32 x half>) + +define void @storeu32f16mask(<32 x i1> %mask, <32 x half>* %addr, <32 x half> %val) { +; X64-LABEL: storeu32f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 %zmm1, (%rdi) {%k1} +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu32f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %zmm1, (%eax) {%k1} +; X86-NEXT: vzeroupper +; X86-NEXT: retl + call void @llvm.masked.store.v32f16.p0v32f16(<32 x half> %val, <32 x half>* %addr, i32 4, <32 x i1>%mask) + ret void +} + +define <32 x half> @maskloadu32f16(<32 x half>* %addr, <32 x half> %val, <32 x i1> %mask) { +; X64-LABEL: maskloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm1, %ymm1 +; X64-NEXT: vpmovb2m %ymm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm1, %ymm1 +; X86-NEXT: vpmovb2m %ymm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> %val) + ret <32 x half> %res +} + +define <32 x half> @maskuloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { +; X64-LABEL: maskuloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> undef) + ret <32 x half> %res +} + +define <32 x half> @maskzloadu32f16(<32 x half>* %addr, <32 x i1> %mask) { +; X64-LABEL: maskzloadu32f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %ymm0, %ymm0 +; X64-NEXT: vpmovb2m %ymm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu32f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %ymm0, %ymm0 +; X86-NEXT: vpmovb2m %ymm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %zmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <32 x half> @llvm.masked.load.v32f16.p0v32f16(<32 x half>* %addr, i32 4, <32 x i1> %mask, <32 x half> zeroinitializer) + ret <32 x half> %res +} + +define <32 x half> 
@movrr32f16(<32 x half> %a, <32 x half> %b) { +; CHECK-LABEL: movrr32f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <32 x half> %b +} + +define <32 x half> @movrrk32f16(<32 x half> %a, <32 x half> %b, i32 %msk) { +; X64-LABEL: movrrk32f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk32f16: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i32 %msk to <32 x i1> + %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> %b + ret <32 x half> %res +} + +define <32 x half> @movrrkz32f16(<32 x half> %a, i32 %msk) { +; X64-LABEL: movrrkz32f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz32f16: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i32 %msk to <32 x i1> + %res = select <32 x i1> %mask, <32 x half> %a, <32 x half> zeroinitializer + ret <32 x half> %res +} + +define <16 x half> @load16f16(<16 x half>* %a) { +; X64-LABEL: load16f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: load16f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %ymm0 +; X86-NEXT: retl + %res = load <16 x half>, <16 x half>* %a + ret <16 x half> %res +} + +define <16 x half> @load16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { +; X64-LABEL: load16f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load16f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @load16f16maskz(<16 x half>* %a, i16 %c) { +; X64-LABEL: load16f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load16f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <16 x half> @loadu16f16(<16 x half>* %a) { +; X64-LABEL: loadu16f16: +; X64: # %bb.0: +; X64-NEXT: vmovups (%rdi), %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %ymm0 +; X86-NEXT: retl + %res = load <16 x half>, <16 x half>* %a, align 8 + ret <16 x half> %res +} + +define <16 x half> @loadu16f16mask(<16 x half>* %a, <16 x half> %b, i16 %c) { +; X64-LABEL: loadu16f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 
x half>* %a, align 8 + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @loadu16f16maskz(<16 x half>* %a, i16 %c) { +; X64-LABEL: loadu16f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu16f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i16 %c to <16 x i1> + %res0 = load <16 x half>, <16 x half>* %a, align 8 + %res = select <16 x i1> %msk, <16 x half> %res0, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define void @store16f16(<16 x half> %a) { +; X64-LABEL: store16f16: +; X64: # %bb.0: +; X64-NEXT: movq g16f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %ymm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: store16f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %ymm0, g16f16 +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <16 x half> %a, <16 x half>* @g16f16 + ret void +} + +define void @storeu16f16(<16 x half> %a) { +; X64-LABEL: storeu16f16: +; X64: # %bb.0: +; X64-NEXT: movq g16f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %ymm0, (%rax) +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu16f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %ymm0, g16f16u +; X86-NEXT: vzeroupper +; X86-NEXT: retl + store <16 x half> %a, <16 x half>* @g16f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v16f16.p0v16f16(<16 x half>, <16 x half>*, i32, <16 x i1>) +declare <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>*, i32, <16 x i1>, <16 x half>) + +define void @storeu16f16mask(<16 x i1> %mask, <16 x half>* %addr, <16 x half> %val) { +; X64-LABEL: storeu16f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1} +; X64-NEXT: vzeroupper +; X64-NEXT: retq +; +; X86-LABEL: storeu16f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %ymm1, (%eax) {%k1} +; X86-NEXT: vzeroupper +; X86-NEXT: retl + call void @llvm.masked.store.v16f16.p0v16f16(<16 x half> %val, <16 x half>* %addr, i32 4, <16 x i1>%mask) + ret void +} + +define <16 x half> @maskloadu16f16(<16 x half>* %addr, <16 x half> %val, <16 x i1> %mask) { +; X64-LABEL: maskloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm1, %xmm1 +; X64-NEXT: vpmovb2m %xmm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm1, %xmm1 +; X86-NEXT: vpmovb2m %xmm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} +; X86-NEXT: retl + %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> %val) + ret <16 x half> %res +} + +define <16 x half> @maskuloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { +; X64-LABEL: maskuloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <16 x half> 
@llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> undef) + ret <16 x half> %res +} + +define <16 x half> @maskzloadu16f16(<16 x half>* %addr, <16 x i1> %mask) { +; X64-LABEL: maskzloadu16f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $7, %xmm0, %xmm0 +; X64-NEXT: vpmovb2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu16f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $7, %xmm0, %xmm0 +; X86-NEXT: vpmovb2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <16 x half> @llvm.masked.load.v16f16.p0v16f16(<16 x half>* %addr, i32 4, <16 x i1> %mask, <16 x half> zeroinitializer) + ret <16 x half> %res +} + +define <16 x half> @movrr16f16(<16 x half> %a, <16 x half> %b) { +; CHECK-LABEL: movrr16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %ymm1, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <16 x half> %b +} + +define <16 x half> @movrrk16f16(<16 x half> %a, <16 x half> %b, i16 %msk) { +; X64-LABEL: movrrk16f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk16f16: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %ymm0, %ymm1, %ymm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i16 %msk to <16 x i1> + %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> %b + ret <16 x half> %res +} + +define <16 x half> @movrrkz16f16(<16 x half> %a, i16 %msk) { +; X64-LABEL: movrrkz16f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz16f16: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %ymm0, %ymm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i16 %msk to <16 x i1> + %res = select <16 x i1> %mask, <16 x half> %a, <16 x half> zeroinitializer + ret <16 x half> %res +} + +define <8 x half> @load8f16(<8 x half>* %a) { +; X64-LABEL: load8f16: +; X64: # %bb.0: +; X64-NEXT: vmovaps (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: load8f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovaps (%eax), %xmm0 +; X86-NEXT: retl + %res = load <8 x half>, <8 x half>* %a + ret <8 x half> %res +} + +define <8 x half> @load8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) { +; X64-LABEL: load8f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: load8f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @load8f16maskz(<8 x half>* %a, i8 %c) { +; X64-LABEL: load8f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: load8f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define <8 x half> @loadu8f16(<8 x half>* %a) { +; X64-LABEL: loadu8f16: +; X64: # %bb.0: 
+; X64-NEXT: vmovups (%rdi), %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovups (%eax), %xmm0 +; X86-NEXT: retl + %res = load <8 x half>, <8 x half>* %a, align 8 + ret <8 x half> %res +} + +define <8 x half> @loadu8f16mask(<8 x half>* %a, <8 x half> %b, i8 %c) { +; X64-LABEL: loadu8f16mask: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16mask: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a, align 8 + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @loadu8f16maskz(<8 x half>* %a, i8 %c) { +; X64-LABEL: loadu8f16maskz: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: loadu8f16maskz: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %msk = bitcast i8 %c to <8 x i1> + %res0 = load <8 x half>, <8 x half>* %a, align 8 + %res = select <8 x i1> %msk, <8 x half> %res0, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define void @store8f16(<8 x half> %a) { +; X64-LABEL: store8f16: +; X64: # %bb.0: +; X64-NEXT: movq g8f16@GOTPCREL(%rip), %rax +; X64-NEXT: vmovaps %xmm0, (%rax) +; X64-NEXT: retq +; +; X86-LABEL: store8f16: +; X86: # %bb.0: +; X86-NEXT: vmovaps %xmm0, g8f16 +; X86-NEXT: retl + store <8 x half> %a, <8 x half>* @g8f16 + ret void +} + +define void @storeu8f16(<8 x half> %a) { +; X64-LABEL: storeu8f16: +; X64: # %bb.0: +; X64-NEXT: movq g8f16u@GOTPCREL(%rip), %rax +; X64-NEXT: vmovups %xmm0, (%rax) +; X64-NEXT: retq +; +; X86-LABEL: storeu8f16: +; X86: # %bb.0: +; X86-NEXT: vmovups %xmm0, g8f16u +; X86-NEXT: retl + store <8 x half> %a, <8 x half>* @g8f16u, align 8 + ret void +} + +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) + +define void @storeu8f16mask(<8 x i1> %mask, <8 x half>* %addr, <8 x half> %val) { +; X64-LABEL: storeu8f16mask: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1} +; X64-NEXT: retq +; +; X86-LABEL: storeu8f16mask: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 %xmm1, (%eax) {%k1} +; X86-NEXT: retl + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %val, <8 x half>* %addr, i32 4, <8 x i1>%mask) + ret void +} + +define <8 x half> @maskloadu8f16(<8 x half>* %addr, <8 x half> %val, <8 x i1> %mask) { +; X64-LABEL: maskloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm1, %xmm1 +; X64-NEXT: vpmovw2m %xmm1, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: maskloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm1, %xmm1 +; X86-NEXT: vpmovw2m %xmm1, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> %val) + ret <8 x half> %res +} + +define <8 x 
half> @maskuloadu8f16(<8 x half>* %addr, <8 x i1> %mask) { +; X64-LABEL: maskuloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskuloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> undef) + ret <8 x half> %res +} + +define <8 x half> @maskzloadu8f16(<8 x half>* %addr, <8 x i1> %mask) { +; X64-LABEL: maskzloadu8f16: +; X64: # %bb.0: +; X64-NEXT: vpsllw $15, %xmm0, %xmm0 +; X64-NEXT: vpmovw2m %xmm0, %k1 +; X64-NEXT: vmovdqu16 (%rdi), %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: maskzloadu8f16: +; X86: # %bb.0: +; X86-NEXT: vpsllw $15, %xmm0, %xmm0 +; X86-NEXT: vpmovw2m %xmm0, %k1 +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovdqu16 (%eax), %xmm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %addr, i32 4, <8 x i1> %mask, <8 x half> zeroinitializer) + ret <8 x half> %res +} + +define <8 x half> @movrr8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: movrr8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + ret <8 x half> %b +} + +define <8 x half> @movrrk8f16(<8 x half> %a, <8 x half> %b, i8 %msk) { +; X64-LABEL: movrrk8f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X64-NEXT: retq +; +; X86-LABEL: movrrk8f16: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} +; X86-NEXT: retl + %mask = bitcast i8 %msk to <8 x i1> + %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> %b + ret <8 x half> %res +} + +define <8 x half> @movrrkz8f16(<8 x half> %a, i8 %msk) { +; X64-LABEL: movrrkz8f16: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 +; X64-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: movrrkz8f16: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: retl + %mask = bitcast i8 %msk to <8 x i1> + %res = select <8 x i1> %mask, <8 x half> %a, <8 x half> zeroinitializer + ret <8 x half> %res +} + +define i16 @test_movw(half %x) { +; X64-LABEL: test_movw: +; X64: # %bb.0: +; X64-NEXT: vmovw %xmm0, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_movw: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl + %res = bitcast half %x to i16 + ret i16 %res +} + +define half @test_movw2(i16 %x) { +; X64-LABEL: test_movw2: +; X64: # %bb.0: +; X64-NEXT: vmovw %edi, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_movw2: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl + %res = bitcast i16 %x to half + ret half %res +} + +; sext avoids having a truncate in front of the bitcast input due to calling +; convention or i16 op promotion. 
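+; With the sign extension in place, test_movw3 below is expected to select a single vmovw
+; from the GPR for the i16 -> half bitcast, as checked on both X64 and X86.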
+define half @test_movw3(i8 %x) { +; X64-LABEL: test_movw3: +; X64: # %bb.0: +; X64-NEXT: movsbl %dil, %eax +; X64-NEXT: vmovw %eax, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_movw3: +; X86: # %bb.0: +; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw %eax, %xmm0 +; X86-NEXT: retl + %z = sext i8 %x to i16 + %a = bitcast i16 %z to half + ret half %a +} + +define half @extract_f16_0(<8 x half> %x) { +; CHECK-LABEL: extract_f16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 0 + ret half %res +} + +define half @extract_f16_1(<8 x half> %x) { +; CHECK-LABEL: extract_f16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 1 + ret half %res +} + +define half @extract_f16_2(<8 x half> %x) { +; CHECK-LABEL: extract_f16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 2 + ret half %res +} + +define half @extract_f16_3(<8 x half> %x) { +; CHECK-LABEL: extract_f16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 3 + ret half %res +} + +define half @extract_f16_4(<8 x half> %x) { +; CHECK-LABEL: extract_f16_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 4 + ret half %res +} + +define half @extract_f16_5(<8 x half> %x) { +; CHECK-LABEL: extract_f16_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 5 + ret half %res +} + +define half @extract_f16_6(<8 x half> %x) { +; CHECK-LABEL: extract_f16_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 6 + ret half %res +} + +define half @extract_f16_7(<8 x half> %x) { +; CHECK-LABEL: extract_f16_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x half> %x, i32 7 + ret half %res +} + +define i16 @extract_i16_0(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 0 + ret i16 %res +} + +define i16 @extract_i16_1(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 1 + ret i16 %res +} + +define i16 @extract_i16_2(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $2, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 2 + ret i16 %res +} + +define i16 @extract_i16_3(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $3, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 3 + ret i16 %res +} + +define i16 @extract_i16_4(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_4: +; CHECK: # %bb.0: 
+; CHECK-NEXT: vpextrw $4, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 4 + ret i16 %res +} + +define i16 @extract_i16_5(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $5, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 5 + ret i16 %res +} + +define i16 @extract_i16_6(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $6, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 6 + ret i16 %res +} + +define i16 @extract_i16_7(<8 x i16> %x) { +; CHECK-LABEL: extract_i16_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $7, %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 7 + ret i16 %res +} + +define void @extract_store_f16_0(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_0: +; X64: # %bb.0: +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_0: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 0 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_1(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_1: +; X64: # %bb.0: +; X64-NEXT: vpsrld $16, %xmm0, %xmm0 +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrld $16, %xmm0, %xmm0 +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 1 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_2(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_2: +; X64: # %bb.0: +; X64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 2 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_3(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_3: +; X64: # %bb.0: +; X64-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrlq $48, %xmm0, %xmm0 +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 3 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_4(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_4: +; X64: # %bb.0: +; X64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 4 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_5(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_5: +; X64: # %bb.0: +; X64-NEXT: vpsrldq {{.*#+}} xmm0 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_5: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 5 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_6(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_6: +; X64: # %bb.0: +; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_6: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 6 + store half %res, half* %y + ret void +} + +define void @extract_store_f16_7(<8 x half> %x, half* %y) { +; X64-LABEL: extract_store_f16_7: +; X64: # %bb.0: +; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovsh %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_f16_7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovsh %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x half> %x, i32 7 + store half %res, half* %y + ret void +} + +define void @extract_store_i16_0(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_0: +; X64: # %bb.0: +; X64-NEXT: vpextrw $0, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_0: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $0, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 0 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_1(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_1: +; X64: # %bb.0: +; X64-NEXT: vpextrw $1, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_1: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $1, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 1 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_2(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_2: +; X64: # %bb.0: +; X64-NEXT: vpextrw $2, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $2, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 2 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_3(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_3: +; X64: # %bb.0: +; X64-NEXT: vpextrw $3, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_3: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $3, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 3 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_4(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_4: +; X64: # %bb.0: +; X64-NEXT: vpextrw $4, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_4: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $4, %xmm0, (%eax) 
+; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 4 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_5(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_5: +; X64: # %bb.0: +; X64-NEXT: vpextrw $5, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_5: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $5, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 5 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_6(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_6: +; X64: # %bb.0: +; X64-NEXT: vpextrw $6, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_6: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $6, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 6 + store i16 %res, i16* %y + ret void +} + +define void @extract_store_i16_7(<8 x i16> %x, i16* %y) { +; X64-LABEL: extract_store_i16_7: +; X64: # %bb.0: +; X64-NEXT: vpextrw $7, %xmm0, (%rdi) +; X64-NEXT: retq +; +; X86-LABEL: extract_store_i16_7: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpextrw $7, %xmm0, (%eax) +; X86-NEXT: retl + %res = extractelement <8 x i16> %x, i32 7 + store i16 %res, i16* %y + ret void +} + +define i32 @extract_zext_i16_0(<8 x i16> %x) { +; CHECK-LABEL: extract_zext_i16_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 0 + %res2 = zext i16 %res to i32 + ret i32 %res2 +} + +define i32 @extract_zext_i16_1(<8 x i16> %x) { +; CHECK-LABEL: extract_zext_i16_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpextrw $1, %xmm0, %eax +; CHECK-NEXT: ret{{[l|q]}} + %res = extractelement <8 x i16> %x, i32 1 + %res2 = zext i16 %res to i32 + ret i32 %res2 +} + +define <8 x half> @build_vector_xxxxuuuu(half %a0, half %a1, half %a2, half %a3) { +; X64-LABEL: build_vector_xxxxuuuu: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxuuuu: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 0 + %b = insertelement <8 x half> %a, half %a1, i32 1 + %c = insertelement <8 x half> %b, half %a2, i32 2 + %d = insertelement <8 x half> %c, half %a3, i32 3 + ret <8 x half> %d +} + +define <8 x half> @build_vector_uuuuxxxx(half %a0, half %a1, half %a2, half %a3) { +; X64-LABEL: build_vector_uuuuxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: 
build_vector_uuuuxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 4 + %b = insertelement <8 x half> %a, half %a1, i32 5 + %c = insertelement <8 x half> %b, half %a2, i32 6 + %d = insertelement <8 x half> %c, half %a3, i32 7 + ret <8 x half> %d +} + +define <8 x half> @build_vector_xxxxxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { +; X64-LABEL: build_vector_xxxxxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X86-NEXT: retl + %a = insertelement <8 x half> undef, half %a0, i32 0 + %b = insertelement <8 x half> %a, half %a1, i32 1 + %c = insertelement <8 x half> %b, half %a2, i32 2 + %d = insertelement <8 x half> %c, half %a3, i32 3 + %e = insertelement <8 x half> %d, half %a4, i32 4 + %f = insertelement <8 x half> %e, half %a5, i32 5 + %g = insertelement <8 x half> %f, half %a6, i32 6 + %h = insertelement <8 x half> %g, half %a7, i32 7 + ret <8 x half> %h +} + +define <16 x half> @build_vector_xxxxuuuuuuuuxxxx(half %a0, half %a1, half %a2, half %a3, half %a4, half %a5, half %a6, half %a7) { +; X64-LABEL: build_vector_xxxxuuuuuuuuxxxx: +; X64: # %bb.0: +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: vinsertps {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],zero,zero +; X64-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; X64-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; X64-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: vpbroadcastq %xmm1, %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X64-NEXT: retq +; +; X86-LABEL: build_vector_xxxxuuuuuuuuxxxx: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm1 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm2 +; X86-NEXT: vmovsh {{[0-9]+}}(%esp), %xmm3 +; X86-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-NEXT: retl + %a = insertelement <16 x half> undef, half %a0, i32 0 + %b = insertelement <16 x half> %a, half %a1, i32 1 + %c = insertelement <16 x half> %b, half %a2, i32 2 + %d = insertelement <16 x half> %c, half %a3, i32 3 + %e = insertelement <16 x half> %d, half %a4, i32 12 + %f = insertelement <16 x half> %e, half %a5, i32 13 + %g = insertelement <16 x half> %f, half %a6, i32 14 + %h = insertelement <16 x half> %g, half %a7, i32 15 + ret <16 x half> %h +} + +define <8 x half> @regression1(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: regression1: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,0,1,2,3,4,5,6,7,14,15,10,11] +; CHECK-NEXT: ret{{[l|q]}} + %res = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> + ret <8 x half> %res +} + +define <4 x float> @regression2(i8 addrspace(1)* %0, <4 x i32> %1, <4 x i32> %2, <4 x float> %3, i8* %4) { +; X64-LABEL: regression2: +; X64: # %bb.0: +; X64-NEXT: vmovw (%rsi), %xmm0 +; X64-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X64-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; X64-NEXT: retq +; +; X86-LABEL: regression2: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vmovw (%eax), %xmm0 +; X86-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 +; X86-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] +; X86-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 +; X86-NEXT: retl + %6 = getelementptr i8, i8* %4, i64 0 + %7 = getelementptr i8, i8* %6, i64 0 + %8 = getelementptr i8, i8* %7, i64 0 + %9 = load i8, i8* %8, align 1 + %10 = getelementptr i8, i8* %8, i64 1 + %11 = addrspacecast i8* %10 to i8 addrspace(4)* + %12 = load i8, i8 addrspace(4)* %11, align 1 + %13 = insertelement <2 x i8> poison, i8 %9, i32 0 + %14 = insertelement <2 x i8> 
%13, i8 %12, i32 1
+ %15 = uitofp <2 x i8> %14 to <2 x float>
+ %16 = shufflevector <2 x float> %15, <2 x float> poison, <4 x i32>
+ %17 = shufflevector <4 x float> %16, <4 x float> , <4 x i32>
+ %18 = fmul contract <4 x float> %17,
+ ret <4 x float> %18
+}
diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s
+
+define void @test_mscatter_v16f16(half* %base, <16 x i32> %index, <16 x half> %val)
+; CHECK-LABEL: test_mscatter_v16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpbroadcastq %rdi, %zmm3
+; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; CHECK-NEXT: vpmovsxdq %ymm2, %zmm2
+; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm4
+; CHECK-NEXT: vpaddq %zmm4, %zmm2, %zmm2
+; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm3
+; CHECK-NEXT: vpaddq %zmm3, %zmm0, %zmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpsrld $16, %xmm1, %xmm3
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm4
+; CHECK-NEXT: vmovq %xmm4, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3
+; CHECK-NEXT: vpextrq $1, %xmm4, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm4
+; CHECK-NEXT: vmovq %xmm4, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpextrq $1, %xmm4, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3]
+; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vmovsh %xmm3, (%rax)
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm0
+; CHECK-NEXT: vmovq %xmm2, %rax
+; CHECK-NEXT: vmovsh %xmm0, (%rax)
+; CHECK-NEXT: vpsrld $16, %xmm0, %xmm1
+; CHECK-NEXT: vpextrq $1, %xmm2, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3
+; CHECK-NEXT: vmovq %xmm3, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm1
+; CHECK-NEXT: vpextrq $1, %xmm3, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; CHECK-NEXT: vextracti32x4 $2, %zmm2, %xmm3
+; CHECK-NEXT: vmovq %xmm3, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpextrq $1, %xmm3, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3]
+; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2
+; CHECK-NEXT: vmovq %xmm2, %rax
+; CHECK-NEXT: vmovsh %xmm1, (%rax)
+; CHECK-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT: vpextrq $1, %xmm2, %rax
+; CHECK-NEXT: vmovsh %xmm0, (%rax)
+; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +{ + %gep = getelementptr half, half* %base, <16 x i32> %index + call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> %val, <16 x half*> %gep, i32 4, <16 x i1> ) + ret void +} +declare void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> , <16 x half*> , i32 , <16 x i1>) diff --git a/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll b/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16-subv-broadcast-fp16.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=+avx512fp16 | FileCheck %s + +define dso_local void @test_v8f16_v32f16(<8 x half>* %x_addr, <32 x half>* %y_addr) { +; CHECK-LABEL: test_v8f16_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <8 x half>, <8 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <8 x half> %0, <8 x half> %0, <32 x i32> + store <32 x half> %shuffle.i58, <32 x half>* %y_addr, align 64 + ret void +} + +define dso_local void @test_v8f16_v16f16(<8 x half>* %x_addr, <16 x half>* %y_addr) { +; CHECK-LABEL: test_v8f16_v16f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; CHECK-NEXT: vmovdqa %ymm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <8 x half>, <8 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <8 x half> %0, <8 x half> %0, <16 x i32> + store <16 x half> %shuffle.i58, <16 x half>* %y_addr, align 64 + ret void +} + +define dso_local void @test_v16f16_v32f16(<16 x half>* %x_addr, <32 x half>* %y_addr) { +; CHECK-LABEL: test_v16f16_v32f16: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq +entry: + %0 = load <16 x half>, <16 x half>* %x_addr, align 16 + %shuffle.i58 = shufflevector <16 x half> %0, <16 x half> %0, <32 x i32> + store <32 x half> %shuffle.i58, <32 x half>* %y_addr, align 64 + ret void +} diff --git a/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx512fp16vl-intrinsics.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl -mattr=+avx512fp16 | FileCheck %s + +define signext i16 @test_mm_cvtsi128_si16(<2 x i64> %A) local_unnamed_addr #0 { +; CHECK-LABEL: test_mm_cvtsi128_si16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovw %xmm0, %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq +entry: + %0 = bitcast <2 x i64> %A to <8 x i16> + %vecext.i = extractelement <8 x i16> %0, i32 0 + ret i16 %vecext.i +} + +define <2 x i64> @test_mm_cvtsi16_si128(i16 signext %A) local_unnamed_addr #0 { +; CHECK-LABEL: test_mm_cvtsi16_si128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovw %edi, %xmm0 +; CHECK-NEXT: retq +entry: + %vecinit7.i = insertelement <8 x i16> , i16 %A, i32 0 + %0 = bitcast <8 x i16> %vecinit7.i to <2 x i64> + ret <2 x i64> %0 +} diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll --- 
a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -3,17 +3,69 @@ ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE ; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX ; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX -; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -mtriple=x86_64-linux-android -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+avx512fp16 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 ; RUN: llc < %s -mtriple=i686-linux-gnu -mattr=-sse | FileCheck %s --check-prefixes=X86 ; Check soft floating point conversion function calls. +@vf16 = common dso_local global half 0.000000e+00, align 2 @vf32 = common dso_local global float 0.000000e+00, align 4 @vf64 = common dso_local global double 0.000000e+00, align 8 @vf80 = common dso_local global x86_fp80 0xK00000000000000000000, align 8 @vf128 = common dso_local global fp128 0xL00000000000000000000000000000000, align 16 +define dso_local void @TestFPExtF16_F128() nounwind strictfp { +; X64-SSE-LABEL: TestFPExtF16_F128: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movzwl vf16(%rip), %edi +; X64-SSE-NEXT: callq __gnu_h2f_ieee@PLT +; X64-SSE-NEXT: callq __extendsftf2@PLT +; X64-SSE-NEXT: movaps %xmm0, vf128(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX512-LABEL: TestFPExtF16_F128: +; X64-AVX512: # %bb.0: # %entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovsh vf16(%rip), %xmm0 +; X64-AVX512-NEXT: callq __extendhftf2@PLT +; X64-AVX512-NEXT: vmovaps %xmm0, vf128(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq +; +; X86-LABEL: TestFPExtF16_F128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $24, %esp +; X86-NEXT: movzwl vf16, %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: wait +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendsftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, vf128+12 +; X86-NEXT: movl %edx, vf128+8 +; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: movl %eax, vf128 +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +entry: + %0 = load half, half* @vf16, align 2 + %conv = call fp128 @llvm.experimental.constrained.fpext.f128.f16(half %0, metadata !"fpexcept.strict") #0 + store fp128 %conv, fp128* @vf128, align 16 + ret void +} + define dso_local void @TestFPExtF32_F128() nounwind strictfp { ; X64-SSE-LABEL: TestFPExtF32_F128: ; X64-SSE: # %bb.0: # %entry @@ -162,6 +214,44 @@ ret void } +define dso_local void @TestFPTruncF128_F16() nounwind strictfp { +; X64-SSE-LABEL: TestFPTruncF128_F16: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movaps vf128(%rip), %xmm0 +; X64-SSE-NEXT: callq __trunctfhf2@PLT +; X64-SSE-NEXT: movw %ax, vf16(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX512-LABEL: TestFPTruncF128_F16: +; X64-AVX512: # %bb.0: # 
%entry +; X64-AVX512-NEXT: pushq %rax +; X64-AVX512-NEXT: vmovaps vf128(%rip), %xmm0 +; X64-AVX512-NEXT: callq __trunctfhf2@PLT +; X64-AVX512-NEXT: vmovsh %xmm0, vf16(%rip) +; X64-AVX512-NEXT: popq %rax +; X64-AVX512-NEXT: retq +; +; X86-LABEL: TestFPTruncF128_F16: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl vf128+12 +; X86-NEXT: pushl vf128+8 +; X86-NEXT: pushl vf128+4 +; X86-NEXT: pushl vf128 +; X86-NEXT: calll __trunctfhf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: movw %ax, vf16 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = call half @llvm.experimental.constrained.fptrunc.f16.f128(fp128 %0, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + store half %conv, half* @vf16, align 2 + ret void +} + define dso_local void @TestFPTruncF128_F32() nounwind strictfp { ; X64-SSE-LABEL: TestFPTruncF128_F32: ; X64-SSE: # %bb.0: # %entry @@ -957,9 +1047,11 @@ attributes #0 = { strictfp } +declare half @llvm.experimental.constrained.fptrunc.f16.f128(fp128, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f128(fp128, metadata, metadata) declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.fptrunc.f80.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f16(half, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) declare fp128 @llvm.experimental.constrained.fpext.f128.f80(x86_fp80, metadata) diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower-fp16.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+avx512fp16 -mattr=+avx512vl -o - | FileCheck %s + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +define dso_local <32 x half> @foo3(<32 x half> %a, <32 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo3: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <32 x half> %a, <32 x half> %b + ret <32 x half> %spec.select +} + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. +define dso_local <16 x half> @foo4(<16 x half> %a, <16 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo4: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <16 x half> %a, <16 x half> %b + ret <16 x half> %spec.select +} + +; This test checks that only a single jne gets generated in the final code +; for lowering the CMOV pseudos that get created for this IR. 
+define dso_local <8 x half> @foo5(<8 x half> %a, <8 x half> %b, i1 zeroext %sign) local_unnamed_addr #0 { +; CHECK-LABEL: foo5: +; CHECK: jne +; CHECK-NOT: jne +entry: + %spec.select = select i1 %sign, <8 x half> %a, <8 x half> %b + ret <8 x half> %spec.select +} diff --git a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir --- a/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir +++ b/llvm/test/CodeGen/X86/statepoint-invoke-ra-enter-at-end.mir @@ -340,7 +340,7 @@ ; CHECK: CMP64rr [[NOT64r2]], [[COPY6]], implicit-def $eflags ; CHECK: undef %102.sub_32bit:gr64_with_sub_8bit = MOV32ri 0 ; CHECK: [[CMOV64rr:%[0-9]+]]:gr64 = CMOV64rr [[CMOV64rr]], %102, 4, implicit killed $eflags - ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %102, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + ; CHECK: INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %102, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags ; CHECK: LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, [[COPY5]], implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ; CHECK: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp ; CHECK: $rdi = COPY [[COPY4]] @@ -456,7 +456,7 @@ %63:gr64 = NOT64r %63 CMP64rr %63, %31, implicit-def $eflags %63:gr64 = CMOV64rr %63, %53, 4, implicit killed $eflags - INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4325385 /* reguse:GR64 */, %53, 4325385 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags + INLINEASM &"lock btsq $0,($1)", 1 /* sideeffect attdialect */, 4390921 /* reguse:GR64 */, %53, 4390921 /* reguse:GR64 */, undef %56:gr64, 12 /* clobber */, implicit-def dead early-clobber $df, 12 /* clobber */, implicit-def early-clobber $fpsw, 12 /* clobber */, implicit-def dead early-clobber $eflags LCMPXCHG32 undef %67:gr64, 1, $noreg, 0, $noreg, %65, implicit-def dead $eax, implicit-def dead $eflags, implicit undef $eax :: (load store acquire monotonic (s32) on `i32 addrspace(1)* undef`, addrspace 1) ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp $rdi = COPY %64 diff --git a/llvm/test/MC/Disassembler/X86/avx512fp16.txt b/llvm/test/MC/Disassembler/X86/avx512fp16.txt new file mode 100644 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/avx512fp16.txt @@ -0,0 +1,78 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL + +# ATT: vmovsh %xmm28, %xmm29, %xmm30 +# INTEL: vmovsh xmm30, xmm29, xmm28 +0x62,0x05,0x16,0x00,0x10,0xf4 + +# ATT: vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} +# INTEL: vmovsh xmm30 {k7}, word ptr [rbp + 8*r14 + 268435456] 
+0x62,0x25,0x7e,0x0f,0x10,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovsh (%r9), %xmm30 +# INTEL: vmovsh xmm30, word ptr [r9] +0x62,0x45,0x7e,0x08,0x10,0x31 + +# ATT: vmovsh 254(%rcx), %xmm30 +# INTEL: vmovsh xmm30, word ptr [rcx + 254] +0x62,0x65,0x7e,0x08,0x10,0x71,0x7f + +# ATT: vmovsh -256(%rdx), %xmm30 {%k7} {z} +# INTEL: vmovsh xmm30 {k7} {z}, word ptr [rdx - 256] +0x62,0x65,0x7e,0x8f,0x10,0x72,0x80 + +# ATT: vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} +# INTEL: vmovsh word ptr [rbp + 8*r14 + 268435456] {k7}, xmm30 +0x62,0x25,0x7e,0x0f,0x11,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovsh %xmm30, (%r9) +# INTEL: vmovsh word ptr [r9], xmm30 +0x62,0x45,0x7e,0x08,0x11,0x31 + +# ATT: vmovsh %xmm30, 254(%rcx) +# INTEL: vmovsh word ptr [rcx + 254], xmm30 +0x62,0x65,0x7e,0x08,0x11,0x71,0x7f + +# ATT: vmovsh %xmm30, -256(%rdx) {%k7} +# INTEL: vmovsh word ptr [rdx - 256] {k7}, xmm30 +0x62,0x65,0x7e,0x0f,0x11,0x72,0x80 + +# ATT: vmovw %r12d, %xmm30 +# INTEL: vmovw xmm30, r12d +0x62,0x45,0x7d,0x08,0x6e,0xf4 + +# ATT: vmovw %xmm30, %r12d +# INTEL: vmovw r12d, xmm30 +0x62,0x45,0x7d,0x08,0x7e,0xf4 + +# ATT: vmovw 268435456(%rbp,%r14,8), %xmm30 +# INTEL: vmovw xmm30, word ptr [rbp + 8*r14 + 268435456] +0x62,0x25,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovw (%r9), %xmm30 +# INTEL: vmovw xmm30, word ptr [r9] +0x62,0x45,0x7d,0x08,0x6e,0x31 + +# ATT: vmovw 254(%rcx), %xmm30 +# INTEL: vmovw xmm30, word ptr [rcx + 254] +0x62,0x65,0x7d,0x08,0x6e,0x71,0x7f + +# ATT: vmovw -256(%rdx), %xmm30 +# INTEL: vmovw xmm30, word ptr [rdx - 256] +0x62,0x65,0x7d,0x08,0x6e,0x72,0x80 + +# ATT: vmovw %xmm30, 268435456(%rbp,%r14,8) +# INTEL: vmovw word ptr [rbp + 8*r14 + 268435456], xmm30 +0x62,0x25,0x7d,0x08,0x7e,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# ATT: vmovw %xmm30, (%r9) +# INTEL: vmovw word ptr [r9], xmm30 +0x62,0x45,0x7d,0x08,0x7e,0x31 + +# ATT: vmovw %xmm30, 254(%rcx) +# INTEL: vmovw word ptr [rcx + 254], xmm30 +0x62,0x65,0x7d,0x08,0x7e,0x71,0x7f + +# ATT: vmovw %xmm30, -256(%rdx) +# INTEL: vmovw word ptr [rdx - 256], xmm30 +0x62,0x65,0x7d,0x08,0x7e,0x72,0x80 diff --git a/llvm/test/MC/X86/avx512fp16.s b/llvm/test/MC/X86/avx512fp16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/avx512fp16.s @@ -0,0 +1,77 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// CHECK: vmovsh %xmm28, %xmm29, %xmm30 +// CHECK: encoding: [0x62,0x05,0x16,0x00,0x10,0xf4] + vmovsh %xmm28, %xmm29, %xmm30 + +// CHECK: vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x10,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovsh 268435456(%rbp,%r14,8), %xmm30 {%k7} + +// CHECK: vmovsh (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7e,0x08,0x10,0x31] + vmovsh (%r9), %xmm30 + +// CHECK: vmovsh 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x10,0x71,0x7f] + vmovsh 254(%rcx), %xmm30 + +// CHECK: vmovsh -256(%rdx), %xmm30 {%k7} {z} +// CHECK: encoding: [0x62,0x65,0x7e,0x8f,0x10,0x72,0x80] + vmovsh -256(%rdx), %xmm30 {%k7} {z} + +// CHECK: vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} +// CHECK: encoding: [0x62,0x25,0x7e,0x0f,0x11,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovsh %xmm30, 268435456(%rbp,%r14,8) {%k7} + +// CHECK: vmovsh %xmm30, (%r9) +// CHECK: encoding: [0x62,0x45,0x7e,0x08,0x11,0x31] + vmovsh %xmm30, (%r9) + +// CHECK: vmovsh %xmm30, 254(%rcx) +// CHECK: encoding: [0x62,0x65,0x7e,0x08,0x11,0x71,0x7f] + vmovsh %xmm30, 254(%rcx) + +// CHECK: vmovsh %xmm30, -256(%rdx) {%k7} +// CHECK: encoding: [0x62,0x65,0x7e,0x0f,0x11,0x72,0x80] + vmovsh %xmm30, 
-256(%rdx) {%k7} + +// CHECK: vmovw %r12d, %xmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x6e,0xf4] + vmovw %r12d, %xmm30 + +// CHECK: vmovw %xmm30, %r12d +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x7e,0xf4] + vmovw %xmm30, %r12d + +// CHECK: vmovw 268435456(%rbp,%r14,8), %xmm30 +// CHECK: encoding: [0x62,0x25,0x7d,0x08,0x6e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw 268435456(%rbp,%r14,8), %xmm30 + +// CHECK: vmovw (%r9), %xmm30 +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x6e,0x31] + vmovw (%r9), %xmm30 + +// CHECK: vmovw 254(%rcx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x6e,0x71,0x7f] + vmovw 254(%rcx), %xmm30 + +// CHECK: vmovw -256(%rdx), %xmm30 +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x6e,0x72,0x80] + vmovw -256(%rdx), %xmm30 + +// CHECK: vmovw %xmm30, 268435456(%rbp,%r14,8) +// CHECK: encoding: [0x62,0x25,0x7d,0x08,0x7e,0xb4,0xf5,0x00,0x00,0x00,0x10] + vmovw %xmm30, 268435456(%rbp,%r14,8) + +// CHECK: vmovw %xmm30, (%r9) +// CHECK: encoding: [0x62,0x45,0x7d,0x08,0x7e,0x31] + vmovw %xmm30, (%r9) + +// CHECK: vmovw %xmm30, 254(%rcx) +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x71,0x7f] + vmovw %xmm30, 254(%rcx) + +// CHECK: vmovw %xmm30, -256(%rdx) +// CHECK: encoding: [0x62,0x65,0x7d,0x08,0x7e,0x72,0x80] + vmovw %xmm30, -256(%rdx) diff --git a/llvm/test/MC/X86/intel-syntax-avx512fp16.s b/llvm/test/MC/X86/intel-syntax-avx512fp16.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/X86/intel-syntax-avx512fp16.s @@ -0,0 +1,77 @@ +// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: vmovsh xmm6, xmm5, xmm4 +// CHECK: encoding: [0x62,0xf5,0x56,0x08,0x10,0xf4] + vmovsh xmm6, xmm5, xmm4 + +// CHECK: vmovsh xmm6 {k7}, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x10,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovsh xmm6 {k7}, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmovsh xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x10,0x31] + vmovsh xmm6, word ptr [ecx] + +// CHECK: vmovsh xmm6, word ptr [ecx + 254] +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x10,0x71,0x7f] + vmovsh xmm6, word ptr [ecx + 254] + +// CHECK: vmovsh xmm6 {k7} {z}, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7e,0x8f,0x10,0x72,0x80] + vmovsh xmm6 {k7} {z}, word ptr [edx - 256] + +// CHECK: vmovsh word ptr [esp + 8*esi + 268435456] {k7}, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x11,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovsh word ptr [esp + 8*esi + 268435456] {k7}, xmm6 + +// CHECK: vmovsh word ptr [ecx], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x11,0x31] + vmovsh word ptr [ecx], xmm6 + +// CHECK: vmovsh word ptr [ecx + 254], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x08,0x11,0x71,0x7f] + vmovsh word ptr [ecx + 254], xmm6 + +// CHECK: vmovsh word ptr [edx - 256] {k7}, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7e,0x0f,0x11,0x72,0x80] + vmovsh word ptr [edx - 256] {k7}, xmm6 + +// CHECK: vmovw xmm6, edx +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0xf2] + vmovw xmm6, edx + +// CHECK: vmovw edx, xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0xf2] + vmovw edx, xmm6 + +// CHECK: vmovw xmm6, word ptr [esp + 8*esi + 268435456] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovw xmm6, word ptr [esp + 8*esi + 268435456] + +// CHECK: vmovw xmm6, word ptr [ecx] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x31] + vmovw xmm6, word ptr [ecx] + +// CHECK: vmovw xmm6, word ptr [ecx + 254] +// CHECK: encoding: 
[0x62,0xf5,0x7d,0x08,0x6e,0x71,0x7f] + vmovw xmm6, word ptr [ecx + 254] + +// CHECK: vmovw xmm6, word ptr [edx - 256] +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x6e,0x72,0x80] + vmovw xmm6, word ptr [edx - 256] + +// CHECK: vmovw word ptr [esp + 8*esi + 268435456], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0xb4,0xf4,0x00,0x00,0x00,0x10] + vmovw word ptr [esp + 8*esi + 268435456], xmm6 + +// CHECK: vmovw word ptr [ecx], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x31] + vmovw word ptr [ecx], xmm6 + +// CHECK: vmovw word ptr [ecx + 254], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x71,0x7f] + vmovw word ptr [ecx + 254], xmm6 + +// CHECK: vmovw word ptr [edx - 256], xmm6 +// CHECK: encoding: [0x62,0xf5,0x7d,0x08,0x7e,0x72,0x80] + vmovw word ptr [edx - 256], xmm6 diff --git a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir --- a/llvm/test/MachineVerifier/test_copy_physregs_x86.mir +++ b/llvm/test/MachineVerifier/test_copy_physregs_x86.mir @@ -28,7 +28,7 @@ bb.0: liveins: $xmm0, $xmm1, $xmm2, $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm %0:_(s16) = COPY $xmm0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** @@ -40,7 +40,7 @@ ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** %3:_(<8 x s32>) = COPY $xmm3 - ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** + ; FP16 reg is sub_reg of xmm $xmm0 = COPY %0 ; CHECK: *** Bad machine code: Copy Instruction is illegal with mismatching sizes *** diff --git a/llvm/utils/TableGen/X86DisassemblerTables.h b/llvm/utils/TableGen/X86DisassemblerTables.h --- a/llvm/utils/TableGen/X86DisassemblerTables.h +++ b/llvm/utils/TableGen/X86DisassemblerTables.h @@ -41,7 +41,9 @@ /// [5] XOP9 map opcode /// [6] XOPA map opcode /// [7] 3dnow map opcode - std::unique_ptr Tables[8]; + /// [8] fixed length MAP5 opcode + /// [9] fixed length MAP6 opcode + std::unique_ptr Tables[10]; // Table of ModRM encodings. 
typedef std::map, unsigned> ModRMMapTy; diff --git a/llvm/utils/TableGen/X86DisassemblerTables.cpp b/llvm/utils/TableGen/X86DisassemblerTables.cpp --- a/llvm/utils/TableGen/X86DisassemblerTables.cpp +++ b/llvm/utils/TableGen/X86DisassemblerTables.cpp @@ -994,6 +994,8 @@ emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[5], XOP9_MAP_STR); emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[6], XOPA_MAP_STR); emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[7], THREEDNOW_MAP_STR); + emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[8], MAP5_STR); + emitContextDecision(o1, o2, i1, i2, ModRMTableNum, *Tables[9], MAP6_STR); } void DisassemblerTables::emit(raw_ostream &o) const { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -130,7 +130,8 @@ }; enum { - OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7 + OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6, ThreeDNow = 7, + T_MAP5 = 8, T_MAP6 = 9 }; enum { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp --- a/llvm/utils/TableGen/X86RecognizableInstr.cpp +++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp @@ -752,6 +752,8 @@ case X86Local::XOP9: opcodeType = XOP9_MAP; break; case X86Local::XOPA: opcodeType = XOPA_MAP; break; case X86Local::ThreeDNow: opcodeType = THREEDNOW_MAP; break; + case X86Local::T_MAP5: opcodeType = MAP5; break; + case X86Local::T_MAP6: opcodeType = MAP6; break; } std::unique_ptr filter; @@ -901,10 +903,13 @@ TYPE("FR64X", TYPE_XMM) TYPE("f64mem", TYPE_M) TYPE("sdmem", TYPE_M) + TYPE("FR16X", TYPE_XMM) TYPE("FR32", TYPE_XMM) TYPE("FR32X", TYPE_XMM) TYPE("f32mem", TYPE_M) + TYPE("f16mem", TYPE_M) TYPE("ssmem", TYPE_M) + TYPE("shmem", TYPE_M) TYPE("RST", TYPE_ST) TYPE("RSTi", TYPE_ST) TYPE("i128mem", TYPE_M) @@ -1019,6 +1024,7 @@ ENCODING("FR128", ENCODING_IB) ENCODING("VR128", ENCODING_IB) ENCODING("VR256", ENCODING_IB) + ENCODING("FR16X", ENCODING_IB) ENCODING("FR32X", ENCODING_IB) ENCODING("FR64X", ENCODING_IB) ENCODING("VR128X", ENCODING_IB) @@ -1047,6 +1053,7 @@ ENCODING("FR32", ENCODING_RM) ENCODING("FR64X", ENCODING_RM) ENCODING("FR32X", ENCODING_RM) + ENCODING("FR16X", ENCODING_RM) ENCODING("VR64", ENCODING_RM) ENCODING("VR256", ENCODING_RM) ENCODING("VR256X", ENCODING_RM) @@ -1091,6 +1098,7 @@ ENCODING("VR128X", ENCODING_REG) ENCODING("FR64X", ENCODING_REG) ENCODING("FR32X", ENCODING_REG) + ENCODING("FR16X", ENCODING_REG) ENCODING("VR512", ENCODING_REG) ENCODING("VK1", ENCODING_REG) ENCODING("VK2", ENCODING_REG) @@ -1127,6 +1135,7 @@ ENCODING("FR64", ENCODING_VVVV) ENCODING("VR128", ENCODING_VVVV) ENCODING("VR256", ENCODING_VVVV) + ENCODING("FR16X", ENCODING_VVVV) ENCODING("FR32X", ENCODING_VVVV) ENCODING("FR64X", ENCODING_VVVV) ENCODING("VR128X", ENCODING_VVVV) @@ -1170,6 +1179,7 @@ ENCODING("i32mem", ENCODING_RM) ENCODING("i64mem", ENCODING_RM) ENCODING("i8mem", ENCODING_RM) + ENCODING("shmem", ENCODING_RM) ENCODING("ssmem", ENCODING_RM) ENCODING("sdmem", ENCODING_RM) ENCODING("f128mem", ENCODING_RM) @@ -1177,6 +1187,7 @@ ENCODING("f512mem", ENCODING_RM) ENCODING("f64mem", ENCODING_RM) ENCODING("f32mem", ENCODING_RM) + ENCODING("f16mem", ENCODING_RM) ENCODING("i128mem", ENCODING_RM) ENCODING("i256mem", ENCODING_RM) ENCODING("i512mem", ENCODING_RM)
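
The TableGen hunks above widen the disassembler's per-map table array from 8 to 10 entries and give the new maps the enum values T_MAP5 = 8 and T_MAP6 = 9, so the opcode-map selector keeps doubling as a direct index into the table array. The following is a minimal, self-contained C++ sketch of that indexing pattern; the names DisassemblerTablesSketch, forMap and NumOpcodeMaps are illustrative only and not part of the LLVM API.

    #include <array>
    #include <cassert>
    #include <memory>

    // Mirrors the opcode-map enum after this patch: MAP5/MAP6 take the next
    // two slots after ThreeDNow, and the table array is sized to match.
    enum OpcodeMap {
      OB = 0, TB = 1, T8 = 2, TA = 3, XOP8 = 4, XOP9 = 5, XOPA = 6,
      ThreeDNow = 7, T_MAP5 = 8, T_MAP6 = 9, NumOpcodeMaps = 10
    };

    struct ContextDecision {}; // stand-in for the real per-map decoding state

    struct DisassemblerTablesSketch {
      // One decision table per opcode map, indexed directly by OpcodeMap.
      std::array<std::unique_ptr<ContextDecision>, NumOpcodeMaps> Tables;

      ContextDecision &forMap(OpcodeMap Map) {
        assert(Map < NumOpcodeMaps && "opcode map out of range");
        if (!Tables[Map])
          Tables[Map] = std::make_unique<ContextDecision>();
        return *Tables[Map];
      }
    };

Keeping the enum and the array size in lock-step is what lets the table emitter simply add two more emitContextDecision calls, one for Tables[8] with MAP5_STR and one for Tables[9] with MAP6_STR, as the X86DisassemblerTables.cpp hunk above does.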