Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -1680,20 +1680,6 @@
                         [IntrReadMem, IntrArgMemOnly]>;
 }
 
-// Conditional move ops
-let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_move_ss :
-      GCCBuiltin<"__builtin_ia32_movss_mask">,
-      Intrinsic<[llvm_v4f32_ty],
-                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_mask_move_sd :
-      GCCBuiltin<"__builtin_ia32_movsd_mask">,
-      Intrinsic<[llvm_v2f64_ty],
-                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                [IntrNoMem]>;
-}
-
 // Conditional store ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -338,6 +338,8 @@
          Name.startswith("avx2.pblendd.") ||
          Name.startswith("avx.vbroadcastf128") ||
          Name == "avx2.vbroadcasti128" ||
+         Name.startswith("avx512.mask.move.ss") ||
+         Name.startswith("avx512.mask.move.sd") ||
          Name == "xop.vpcmov" ||
          (Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
       NewFn = nullptr;
@@ -675,6 +677,20 @@
                                           std::max(NumElts, 8U)));
 }
 
+static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
+  Value* A = CI.getArgOperand(0);
+  Value* B = CI.getArgOperand(1);
+  Value* Src = CI.getArgOperand(2);
+  Value* Mask = CI.getArgOperand(3);
+
+  Value* AndNode = Builder.CreateAnd(Mask, APInt(8, 1));
+  Value* Cmp = Builder.CreateIsNotNull(AndNode);
+  Value* Extract1 = Builder.CreateExtractElement(B, (uint64_t)0);
+  Value* Extract2 = Builder.CreateExtractElement(Src, (uint64_t)0);
+  Value* Select = Builder.CreateSelect(Cmp, Extract1, Extract2);
+  return Builder.CreateInsertElement(A, Select, (uint64_t)0);
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -1332,6 +1348,9 @@
       Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsX86 && (Name.startswith("avx512.mask.move.ss") ||
+                         Name.startswith("avx512.mask.move.sd"))) {
+      Rep = upgradeMaskedMove(Builder, *CI);
     } else {
       llvm_unreachable("Unknown function for CallInst upgrade.");
     }
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -788,10 +788,6 @@
                      X86ISD::FMIN_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::FMIN_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_sd, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSD, 0),
-  X86_INTRINSIC_DATA(avx512_mask_move_ss, INTR_TYPE_SCALAR_MASK,
-                     X86ISD::MOVSS, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
                      X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1765,3 +1765,51 @@
   ret <8 x i64> %res2
 }
 
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_mask_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
+  ret <4 x float> %res
+}
+
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_ss:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_mask_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; CHECK-LABEL: test_mm_maskz_move_sd:
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+entry:
+  %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
+  ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
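
Note on the upgrade path: upgradeMaskedMove() expands calls to the removed
llvm.x86.avx512.mask.move.{ss,sd} intrinsics into plain and/icmp/extractelement/
select/insertelement IR, which the backend then matches back to a masked
VMOVSS/VMOVSD, as the CHECK lines in the tests above verify. A rough sketch of
the IR emitted for the ss case (value names here are illustrative, not part of
the patch):

  ; mask bit 0 decides whether element 0 comes from B or from the passthru W
  %and = and i8 %__U, 1
  %cmp = icmp ne i8 %and, 0
  %b0  = extractelement <4 x float> %__B, i64 0
  %w0  = extractelement <4 x float> %__W, i64 0
  %sel = select i1 %cmp, float %b0, float %w0
  ; the upper elements are always taken from A
  %res = insertelement <4 x float> %__A, float %sel, i64 0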