Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1680,20 +1680,6 @@
                     [IntrReadMem, IntrArgMemOnly]>;
 }
 
-// Conditional move ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_move_ss :
-      GCCBuiltin<"__builtin_ia32_movss_mask">,
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_mask_move_sd :
-      GCCBuiltin<"__builtin_ia32_movsd_mask">,
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-}
-
 // Conditional store ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -338,6 +338,8 @@
          Name.startswith("avx2.pblendd.") ||
          Name.startswith("avx.vbroadcastf128") ||
          Name == "avx2.vbroadcasti128" ||
+         Name.startswith("avx512.mask.move.ss") ||
+         Name.startswith("avx512.mask.move.sd") ||
          Name == "xop.vpcmov" ||
          (Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
       NewFn = nullptr;
@@ -675,6 +677,20 @@
                                           std::max(NumElts, 8U)));
 }
 
+static Value *upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
+  Value *A = CI.getArgOperand(0);
+  Value *B = CI.getArgOperand(1);
+  Value *Src = CI.getArgOperand(2);
+  Value *Mask = CI.getArgOperand(3);
+
+  Value *AndNode = Builder.CreateAnd(Mask, APInt(8, 1));
+  Value *Cmp = Builder.CreateIsNotNull(AndNode);
+  Value *Extract1 = Builder.CreateExtractElement(B, (uint64_t)0);
+  Value *Extract2 = Builder.CreateExtractElement(Src, (uint64_t)0);
+  Value *Select = Builder.CreateSelect(Cmp, Extract1, Extract2);
+  return Builder.CreateInsertElement(A, Select, (uint64_t)0);
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -1332,6 +1348,9 @@
       Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsX86 && (Name.startswith("avx512.mask.move.ss") ||
+                         Name.startswith("avx512.mask.move.sd"))) {
+      Rep = upgradeMaskedMove(Builder, *CI);
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3320,6 +3320,63 @@
 defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
                                   VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 
+def : Pat<(v4f32 (X86Movss VR128X:$src0, (v4f32 (scalar_to_vector (f32
+           (X86selects (i1 (trunc GR32:$mask)), (f32 FR32X:$src1), (f32 FR32X:$src2))))))),
+          (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+           (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v4f32 VR128X:$src0),
+           (COPY_TO_REGCLASS FR32X:$src2, VR128X)), VR128X)>;
+
+def : Pat<(v4f32 (X86Movss VR128X:$src0, (v4f32 (scalar_to_vector (f32
+           (X86selects (i1 (trunc GR32:$mask)), (f32 FR32X:$src1), (f32 fp32imm0))))))),
+          (COPY_TO_REGCLASS (VMOVSSZrrkz (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v4f32 VR128X:$src0),
+           (COPY_TO_REGCLASS FR32X:$src1, VR128X)), VR128X)>;
+
+def : Pat<(v2f64 (X86Movsd VR128X:$src0, (v2f64 (scalar_to_vector (f64
+           (X86selects (i1 (trunc GR32:$mask)), (f64 FR64X:$src1), (f64 FR64X:$src2))))))),
+          (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+           (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v2f64 VR128X:$src0),
+           (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VR128X)>;
+
+def : Pat<(v2f64 (X86Movsd VR128X:$src0, (v2f64 (scalar_to_vector (f64
+           (X86selects (i1 (trunc GR32:$mask)), (f64 FR64X:$src1), (f64 fp64imm0))))))),
+          (COPY_TO_REGCLASS (VMOVSDZrrkz (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v2f64 VR128X:$src0),
+           (COPY_TO_REGCLASS FR64X:$src1, VR128X)), VR128X)>;
+
+def : Pat<(masked_store addr:$dst, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+          (v8f64 (insert_subvector undef, (v4f64 (insert_subvector undef,
+           (v2f64 VR128X:$src), (i64 0))), (i64 0)))),
+          (VMOVSDZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+
+def : Pat<(masked_store addr:$dst, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+          (v16f32 (insert_subvector undef, (v8f32 (insert_subvector undef,
+           (v4f32 VR128X:$src), (i64 0))), (i64 0)))),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
+def : Pat<(v4f32 (extract_subvector (v16f32 (masked_load addr:$srcAddr,
+           (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+           (v16f32 (bitconvert (v16i32 immAllZerosV))))), (i64 0))),
+          (VMOVSSZrmkz (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v4f32 (extract_subvector (v16f32 (masked_load addr:$srcAddr,
+           (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+           (v16f32 (insert_subvector undef, (v8f32 (insert_subvector undef,
+            (v4f32 (X86vzmovl VR128X:$src)), (i64 0))), (i64 0))))), (i64 0))),
+          (VMOVSSZrmk VR128X:$src, (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v2f64 (extract_subvector (v8f64 (masked_load addr:$srcAddr,
+           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+           (v8f64 (bitconvert (v16i32 immAllZerosV))))), (i64 0))),
+          (VMOVSDZrmkz (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v2f64 (extract_subvector (v8f64 (masked_load addr:$srcAddr,
+           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+           (v8f64 (insert_subvector undef, (v4f64 (insert_subvector undef,
+            (v2f64 (X86vzmovl VR128X:$src)), (i64 0))), (i64 0))))), (i64 0))),
+          (VMOVSDZrmk VR128X:$src, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), addr:$srcAddr)>;
+
 def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
           (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
            VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -855,6 +855,10 @@
   return N->isExactlyValue(+0.0);
 }]>;
 
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
 def I8Imm : SDNodeXForm<imm, [{
   return getI8Imm(N->getZExtValue(), SDLoc(N));
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -785,10 +785,6 @@
                      X86ISD::FMIN_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMIN_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSS, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
                      X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1765,3 +1765,53 @@
   ret <8 x i64> %res2
 }
 
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_mask_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
+  ret <4 x float> %res
+}
+
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_mask_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -4649,72 +4649,6 @@
 }
 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
 
-declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  ret <4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
-  ret <4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
-  ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovapd %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  ret <2 x double> %res
-}
 
 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
Index: test/CodeGen/X86/avx512-load-store.ll
===================================================================
--- test/CodeGen/X86/avx512-load-store.ll
+++ test/CodeGen/X86/avx512-load-store.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s
+
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_mask_move_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %__B.elt.i = extractelement <4 x float> %__B, i32 0
+  %__W.elt.i = extractelement <4 x float> %__W, i32 0
+  %vecext1.i = select i1 %tobool.i, float %__B.elt.i, float %__W.elt.i
+  %vecins.i = insertelement <4 x float> %__A, float %vecext1.i, i32 0
+  ret <4 x float> %vecins.i
+}
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_maskz_move_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %vecext.i = extractelement <4 x float> %__B, i32 0
+  %cond.i = select i1 %tobool.i, float %vecext.i, float 0.000000e+00
+  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
+  ret <4 x float> %vecins.i
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_mask_move_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %__B.elt.i = extractelement <2 x double> %__B, i32 0
+  %__W.elt.i = extractelement <2 x double> %__W, i32 0
+  %vecext1.i = select i1 %tobool.i, double %__B.elt.i, double %__W.elt.i
+  %vecins.i = insertelement <2 x double> %__A, double %vecext1.i, i32 0
+  ret <2 x double> %vecins.i
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_maskz_move_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %vecext.i = extractelement <2 x double> %__B, i32 0
+  %cond.i = select i1 %tobool.i, double %vecext.i, double 0.000000e+00
+  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
+  ret <2 x double> %vecins.i
+}
+
+define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
+; CHECK-LABEL: test_mm_mask_store_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast float* %__W to <16 x float>*
+  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32>
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %shuffle.i.i, <16 x float>* %0, i32 16, <16 x i1> %2) #5
+  ret void
+}
+
+define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
+; CHECK-LABEL: test_mm_mask_store_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast double* %__W to <8 x double>*
+  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32>
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %shuffle.i.i, <8 x double>* %0, i32 16, <8 x i1> %2) #5
+  ret void
+}
+
+define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_mask_load_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %shuffle.i = shufflevector <4 x float> %__A, <4 x float> , <4 x i32>
+  %0 = bitcast float* %__W to <16 x float>*
+  %shuffle.i.i = shufflevector <4 x float> %shuffle.i, <4 x float> undef, <16 x i32>
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> %shuffle.i.i) #5
+  %shuffle4.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32>
+  ret <4 x float> %shuffle4.i
+}
+
+define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_mask_load_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
+  %0 = bitcast double* %__W to <8 x double>*
+  %shuffle.i.i = shufflevector <2 x double> %shuffle5.i, <2 x double> undef, <8 x i32>
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> %shuffle.i.i) #5
+  %shuffle3.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32>
+  ret <2 x double> %shuffle3.i
+}
+
+define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_maskz_load_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast float* %__W to <16 x float>*
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> zeroinitializer) #5
+  %shuffle.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_maskz_load_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast double* %__W to <8 x double>*
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> zeroinitializer) #5
+  %shuffle.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32>
+  ret <2 x double> %shuffle.i
+}
+
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) #3
+
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) #3
+
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) #4
+
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) #4
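
Note: with the intrinsics removed, existing bitcode that calls llvm.x86.avx512.mask.move.ss/sd is rewritten by upgradeMaskedMove() in AutoUpgrade.cpp into plain IR. A minimal sketch of the expansion for the ss case, matching what that helper emits (the value names %m, %c, %b0, %w0, %sel, %res1 are illustrative only, not taken from the patch):

    ; before upgrade
    %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %A, <4 x float> %B, <4 x float> %W, i8 %U)

    ; after upgrade: test the low mask bit, select between the low elements
    ; of %B and %W, and re-insert the result into element 0 of %A
    %m    = and i8 %U, 1
    %c    = icmp ne i8 %m, 0
    %b0   = extractelement <4 x float> %B, i64 0
    %w0   = extractelement <4 x float> %W, i64 0
    %sel  = select i1 %c, float %b0, float %w0
    %res1 = insertelement <4 x float> %A, float %sel, i64 0

The new X86InstrAVX512.td patterns then match this select/insert form (and the analogous masked load/store forms) back to VMOVSSZrrk/VMOVSSZrrkz and friends, which is what test/CodeGen/X86/avx512-load-store.ll exercises.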