Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1680,20 +1680,6 @@
                     [IntrReadMem, IntrArgMemOnly]>;
 }
 
-// Conditional move ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_move_ss :
-      GCCBuiltin<"__builtin_ia32_movss_mask">,
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_mask_move_sd :
-      GCCBuiltin<"__builtin_ia32_movsd_mask">,
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-}
-
 // Conditional store ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -338,6 +338,8 @@
          Name.startswith("avx2.pblendd.") ||
          Name.startswith("avx.vbroadcastf128") ||
          Name == "avx2.vbroadcasti128" ||
+         Name.startswith("avx512.mask.move.ss") ||
+         Name.startswith("avx512.mask.move.sd") ||
          Name == "xop.vpcmov" ||
          (Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
       NewFn = nullptr;
@@ -675,6 +677,20 @@
                                           std::max(NumElts, 8U)));
 }
 
+static Value *upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
+  Value *A = CI.getArgOperand(0);
+  Value *B = CI.getArgOperand(1);
+  Value *Src = CI.getArgOperand(2);
+  Value *Mask = CI.getArgOperand(3);
+
+  Value *AndNode = Builder.CreateAnd(Mask, APInt(8, 1));
+  Value *Cmp = Builder.CreateIsNotNull(AndNode);
+  Value *Extract1 = Builder.CreateExtractElement(B, (uint64_t)0);
+  Value *Extract2 = Builder.CreateExtractElement(Src, (uint64_t)0);
+  Value *Select = Builder.CreateSelect(Cmp, Extract1, Extract2);
+  return Builder.CreateInsertElement(A, Select, (uint64_t)0);
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -1332,6 +1348,9 @@
       Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsX86 && (Name.startswith("avx512.mask.move.ss") ||
+                         Name.startswith("avx512.mask.move.sd"))) {
+      Rep = upgradeMaskedMove(Builder, *CI);
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -3320,6 +3320,63 @@
 defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
                                   VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
 
+def : Pat<(v4f32 (X86Movss VR128X:$src0, (v4f32 (scalar_to_vector (f32
+           (X86selects (i1 (trunc GR32:$mask)), (f32 FR32X:$src1), (f32 FR32X:$src2))))))),
+          (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src1, VR128X),
+           (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v4f32 VR128X:$src0),
+           (COPY_TO_REGCLASS FR32X:$src2, VR128X)), VR128X)>;
+
+def : Pat<(v4f32 (X86Movss VR128X:$src0, (v4f32 (scalar_to_vector (f32
+           (X86selects (i1 (trunc GR32:$mask)), (f32 FR32X:$src1), (f32 fp32imm0))))))),
+          (COPY_TO_REGCLASS (VMOVSSZrrkz (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v4f32 VR128X:$src0),
+           (COPY_TO_REGCLASS FR32X:$src1, VR128X)), VR128X)>;
+
+def : Pat<(v2f64 (X86Movsd VR128X:$src0, (v2f64 (scalar_to_vector (f64
+           (X86selects (i1 (trunc GR32:$mask)), (f64 FR64X:$src1), (f64 FR64X:$src2))))))),
+          (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src1, VR128X),
+           (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v2f64 VR128X:$src0),
+           (COPY_TO_REGCLASS FR64X:$src2, VR128X)), VR128X)>;
+
+def : Pat<(v2f64 (X86Movsd VR128X:$src0, (v2f64 (scalar_to_vector (f64
+           (X86selects (i1 (trunc GR32:$mask)), (f64 FR64X:$src1), (f64 fp64imm0))))))),
+          (COPY_TO_REGCLASS (VMOVSDZrrkz (COPY_TO_REGCLASS GR32:$mask, VK1WM), (v2f64 VR128X:$src0),
+           (COPY_TO_REGCLASS FR64X:$src1, VR128X)), VR128X)>;
+
+def : Pat<(masked_store addr:$dst, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+          (v8f64 (insert_subvector undef, (v4f64 (insert_subvector undef,
+           (v2f64 VR128X:$src), (i64 0))), (i64 0)))),
+          (VMOVSDZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR64X))>;
+
+def : Pat<(masked_store addr:$dst, (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+          (v16f32 (insert_subvector undef, (v8f32 (insert_subvector undef,
+           (v4f32 VR128X:$src), (i64 0))), (i64 0)))),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)),
+           (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+
+def : Pat<(v4f32 (extract_subvector (v16f32 (masked_load addr:$srcAddr,
+           (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+           (v16f32 (bitconvert (v16i32 immAllZerosV))))), (i64 0))),
+          (VMOVSSZrmkz (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v4f32 (extract_subvector (v16f32 (masked_load addr:$srcAddr,
+           (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))),
+           (v16f32 (insert_subvector undef, (v8f32 (insert_subvector undef,
+            (v4f32 (X86vzmovl VR128X:$src)), (i64 0))), (i64 0))))), (i64 0))),
+          (VMOVSSZrmk VR128X:$src, (i1 (COPY_TO_REGCLASS GR32:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v2f64 (extract_subvector (v8f64 (masked_load addr:$srcAddr,
+           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+           (v8f64 (bitconvert (v16i32 immAllZerosV))))), (i64 0))),
+          (VMOVSDZrmkz (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), addr:$srcAddr)>;
+
+def : Pat<(v2f64 (extract_subvector (v8f64 (masked_load addr:$srcAddr,
+           (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+           (v8f64 (insert_subvector undef, (v4f64 (insert_subvector undef,
+            (v2f64 (X86vzmovl VR128X:$src)), (i64 0))), (i64 0))))), (i64 0))),
+          (VMOVSDZrmk VR128X:$src, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)), addr:$srcAddr)>;
+
 def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
           (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
            VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -855,6 +855,10 @@
   return N->isExactlyValue(+0.0);
 }]>;
 
+def fp64imm0 : PatLeaf<(f64 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
 def I8Imm : SDNodeXForm<imm, [{
   return getI8Imm(N->getZExtValue(), SDLoc(N));
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -785,10 +785,6 @@
                      X86ISD::FMIN_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMIN_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSS, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
                      X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1765,3 +1765,53 @@
   ret <8 x i64> %res2
 }
 
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_mask_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
+  ret <4 x float> %res
+}
+
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_mask_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -4649,72 +4649,6 @@
 }
 declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
 
-declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rrk(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  ret <4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rrkz(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rrkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 %x2)
-  ret <4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_move_ss_rr(<4 x float> %x0, <4 x float> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_ss_rr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> zeroinitializer, i8 -1)
-  ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rr(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 -1)
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rrkz(<2 x double> %x0, <2 x double> %x1, i8 %x2) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> zeroinitializer, i8 %x2)
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_move_sd_rrk(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_move_sd_rrk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT:    vmovapd %xmm2, %xmm0
-; CHECK-NEXT:    retq
-  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  ret <2 x double> %res
-}
 
 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16)
Index: test/CodeGen/X86/avx512-load-store.ll
===================================================================
--- test/CodeGen/X86/avx512-load-store.ll
+++ test/CodeGen/X86/avx512-load-store.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s
+
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_mask_move_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %__B.elt.i = extractelement <4 x float> %__B, i32 0
+  %__W.elt.i = extractelement <4 x float> %__W, i32 0
+  %vecext1.i = select i1 %tobool.i, float %__B.elt.i, float %__W.elt.i
+  %vecins.i = insertelement <4 x float> %__A, float %vecext1.i, i32 0
+  ret <4 x float> %vecins.i
+}
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_maskz_move_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %vecext.i = extractelement <4 x float> %__B, i32 0
+  %cond.i = select i1 %tobool.i, float %vecext.i, float 0.000000e+00
+  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
+  ret <4 x float> %vecins.i
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_mask_move_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %__B.elt.i = extractelement <2 x double> %__B, i32 0
+  %__W.elt.i = extractelement <2 x double> %__W, i32 0
+  %vecext1.i = select i1 %tobool.i, double %__B.elt.i, double %__W.elt.i
+  %vecins.i = insertelement <2 x double> %__A, double %vecext1.i, i32 0
+  ret <2 x double> %vecins.i
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
+; CHECK-LABEL: test_mm_maskz_move_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = and i8 %__U, 1
+  %tobool.i = icmp ne i8 %0, 0
+  %vecext.i = extractelement <2 x double> %__B, i32 0
+  %cond.i = select i1 %tobool.i, double %vecext.i, double 0.000000e+00
+  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
+  ret <2 x double> %vecins.i
+}
+
+define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
+; CHECK-LABEL: test_mm_mask_store_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast float* %__W to <16 x float>*
+  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32>
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %shuffle.i.i, <16 x float>* %0, i32 16, <16 x i1> %2) #5
+  ret void
+}
+
+define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
+; CHECK-LABEL: test_mm_mask_store_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast double* %__W to <8 x double>*
+  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32>
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %shuffle.i.i, <8 x double>* %0, i32 16, <8 x i1> %2) #5
+  ret void
+}
+
+define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_mask_load_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %shuffle.i = shufflevector <4 x float> %__A, <4 x float> , <4 x i32>
+  %0 = bitcast float* %__W to <16 x float>*
+  %shuffle.i.i = shufflevector <4 x float> %shuffle.i, <4 x float> undef, <16 x i32>
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> %shuffle.i.i) #5
+  %shuffle4.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32>
+  ret <4 x float> %shuffle4.i
+}
+
+define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_mask_load_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
+  %0 = bitcast double* %__W to <8 x double>*
+  %shuffle.i.i = shufflevector <2 x double> %shuffle5.i, <2 x double> undef, <8 x i32>
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> %shuffle.i.i) #5
+  %shuffle3.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32>
+  ret <2 x double> %shuffle3.i
+}
+
+define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_maskz_load_ss:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast float* %__W to <16 x float>*
+  %1 = and i8 %__U, 1
+  %conv2.i = zext i8 %1 to i16
+  %2 = bitcast i16 %conv2.i to <16 x i1>
+  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> zeroinitializer) #5
+  %shuffle.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32>
+  ret <4 x float> %shuffle.i
+}
+
+define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
+; CHECK-LABEL: test_mm_maskz_load_sd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %0 = bitcast double* %__W to <8 x double>*
+  %1 = and i8 %__U, 1
+  %2 = bitcast i8 %1 to <8 x i1>
+  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> zeroinitializer) #5
+  %shuffle.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32>
+  ret <2 x double> %shuffle.i
+}
+
+declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) #3
+
+declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) #3
+
+declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) #4
+
+declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) #4
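
Note: with the intrinsics removed, existing bitcode that calls llvm.x86.avx512.mask.move.ss/sd is rewritten by upgradeMaskedMove() in AutoUpgrade.cpp into plain IR. A minimal sketch of the expansion for the ss case, matching what that helper emits (the value names %m, %c, %b0, %w0, %sel, %res1 are illustrative only, not taken from the patch):

    ; before upgrade
    %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %A, <4 x float> %B, <4 x float> %W, i8 %U)

    ; after upgrade: test the low mask bit, select between the low elements
    ; of %B and %W, and re-insert the result into element 0 of %A
    %m    = and i8 %U, 1
    %c    = icmp ne i8 %m, 0
    %b0   = extractelement <4 x float> %B, i64 0
    %w0   = extractelement <4 x float> %W, i64 0
    %sel  = select i1 %c, float %b0, float %w0
    %res1 = insertelement <4 x float> %A, float %sel, i64 0

The new X86InstrAVX512.td patterns then match this select/insert form (and the analogous masked load/store forms) back to VMOVSSZrrk/VMOVSSZrrkz and friends, which is what test/CodeGen/X86/avx512-load-store.ll exercises.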