Index: llvm/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsX86.td +++ llvm/include/llvm/IR/IntrinsicsX86.td @@ -686,16 +686,16 @@ // FP rounding ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">, + def int_x86_sse41_round_ss : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">, + def int_x86_sse41_round_ps : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">, + def int_x86_sse41_round_sd : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">, + def int_x86_sse41_round_pd : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>; } @@ -972,10 +972,10 @@ def int_x86_avx_rcp_ps_256 : GCCBuiltin<"__builtin_ia32_rcpps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; - def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">, + def int_x86_avx_round_pd_256 : Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">, + def int_x86_avx_round_ps_256 : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem]>; } @@ -3788,22 +3788,22 @@ [llvm_v8i64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">, + def int_x86_avx512_mask_rndscale_pd_128 : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">, + def int_x86_avx512_mask_rndscale_pd_256 
: Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">, + def int_x86_avx512_mask_rndscale_pd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">, + def int_x86_avx512_mask_rndscale_ps_128 : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">, + def int_x86_avx512_mask_rndscale_ps_256 : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">, + def int_x86_avx512_mask_rndscale_ps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">, @@ -3943,11 +3943,11 @@ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round_mask">, + def int_x86_avx512_mask_rndscale_ss : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round_mask">, + def int_x86_avx512_mask_rndscale_sd : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ 
llvm/lib/Target/X86/X86ISelLowering.cpp @@ -38710,9 +38710,31 @@ // TODO: SimplifyDemandedBits instead? if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse()) if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) - if (C->getAPIntValue().isOneValue()) - return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, - Src.getOperand(0)); + if (C->getAPIntValue().isOneValue()) { + SDValue Mask = Src.getOperand(0); + if (Mask.getOpcode() == ISD::TRUNCATE && + Mask.getOperand(0).getValueType() != MVT::i16) + Mask = Mask.getOperand(0); + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask); + } + + // The result of AND may also be truncated. This occurs in code for lowered + // masked scalar intrinsics. + if (VT == MVT::v1i1 && Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && + Src.getOperand(0).getOpcode() == ISD::AND && + Src.getOperand(0).hasOneUse()) + if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(0).getOperand(1))) + if (C->getAPIntValue().isOneValue()) { + SDValue Mask = Src.getOperand(0).getOperand(0); + if (Mask.getOpcode() == ISD::TRUNCATE && + Mask.getOperand(0).getValueType() != MVT::i16) + Mask = Mask.getOperand(0); + // Check if the initial value is an i16. scalar_to_vector fails to + // select for that type, so the combine should be aborted.
+ if (Mask.getValueType() == MVT::i16) + return SDValue(); + return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1, Mask); + } return SDValue(); } Index: llvm/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/lib/Target/X86/X86InstrAVX512.td +++ llvm/lib/Target/X86/X86InstrAVX512.td @@ -8599,6 +8599,40 @@ } } +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; +defm : avx512_masked_scalar_imm; + + //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- @@ -9739,10 +9773,14 @@ let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>; +def : Pat<(v16f32 (vselect VK16WM:$mask, (ffloor VR512:$src), VR512:$dst)), + (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0x9))>; def : Pat<(v16f32 (fnearbyint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>; def : Pat<(v16f32 (fceil VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>; +def : Pat<(v16f32 (vselect VK16WM:$mask, (fceil VR512:$src), VR512:$dst)), + (VRNDSCALEPSZrrik VR512:$dst, VK16WM:$mask, VR512:$src, (i32 0xA))>; def : Pat<(v16f32 (frint VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>; def : Pat<(v16f32 (ftrunc VR512:$src)), @@ -9750,10 +9788,14 @@ def : Pat<(v8f64 (ffloor VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>; +def : Pat<(v8f64 (vselect VK8WM:$mask, (ffloor VR512:$src), VR512:$dst)), + (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0x9))>; def : Pat<(v8f64 (fnearbyint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>; def : Pat<(v8f64 (fceil VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>; +def : Pat<(v8f64 (vselect VK8WM:$mask, (fceil VR512:$src), 
VR512:$dst)), + (VRNDSCALEPDZrrik VR512:$dst, VK8WM:$mask, VR512:$src, (i32 0xA))>; def : Pat<(v8f64 (frint VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>; def : Pat<(v8f64 (ftrunc VR512:$src)), @@ -9763,10 +9805,14 @@ let Predicates = [HasVLX] in { def : Pat<(v4f32 (ffloor VR128X:$src)), (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>; +def : Pat<(v4f32 (vselect VK4WM:$mask, (ffloor VR128X:$src), VR128X:$dst)), + (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128X:$src)), (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>; def : Pat<(v4f32 (fceil VR128X:$src)), (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>; +def : Pat<(v4f32 (vselect VK4WM:$mask, (fceil VR128X:$src), VR128X:$dst)), + (VRNDSCALEPSZ128rrik VR128X:$dst, VK4WM:$mask, VR128X:$src, (i32 0xA))>; def : Pat<(v4f32 (frint VR128X:$src)), (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>; def : Pat<(v4f32 (ftrunc VR128X:$src)), @@ -9774,10 +9820,14 @@ def : Pat<(v2f64 (ffloor VR128X:$src)), (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>; +def : Pat<(v2f64 (vselect VK2WM:$mask, (ffloor VR128X:$src), VR128X:$dst)), + (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0x9))>; def : Pat<(v2f64 (fnearbyint VR128X:$src)), (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>; def : Pat<(v2f64 (fceil VR128X:$src)), (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>; +def : Pat<(v2f64 (vselect VK2WM:$mask, (fceil VR128X:$src), VR128X:$dst)), + (VRNDSCALEPDZ128rrik VR128X:$dst, VK2WM:$mask, VR128X:$src, (i32 0xA))>; def : Pat<(v2f64 (frint VR128X:$src)), (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>; def : Pat<(v2f64 (ftrunc VR128X:$src)), @@ -9785,10 +9835,14 @@ def : Pat<(v8f32 (ffloor VR256X:$src)), (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>; +def : Pat<(v8f32 (vselect VK8WM:$mask, (ffloor VR256X:$src), VR256X:$dst)), + (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0x9))>; def : Pat<(v8f32 (fnearbyint VR256X:$src)), (VRNDSCALEPSZ256rri VR256X:$src, 
(i32 0xC))>; def : Pat<(v8f32 (fceil VR256X:$src)), (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>; +def : Pat<(v8f32 (vselect VK8WM:$mask, (fceil VR256X:$src), VR256X:$dst)), + (VRNDSCALEPSZ256rrik VR256X:$dst, VK8WM:$mask, VR256X:$src, (i32 0xA))>; def : Pat<(v8f32 (frint VR256X:$src)), (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>; def : Pat<(v8f32 (ftrunc VR256X:$src)), @@ -9796,10 +9850,14 @@ def : Pat<(v4f64 (ffloor VR256X:$src)), (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>; +def : Pat<(v4f64 (vselect VK4WM:$mask, (ffloor VR256X:$src), VR256X:$dst)), + (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0x9))>; def : Pat<(v4f64 (fnearbyint VR256X:$src)), (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>; def : Pat<(v4f64 (fceil VR256X:$src)), (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>; +def : Pat<(v4f64 (vselect VK4WM:$mask, (fceil VR256X:$src), VR256X:$dst)), + (VRNDSCALEPDZ256rrik VR256X:$dst, VK4WM:$mask, VR256X:$src, (i32 0xA))>; def : Pat<(v4f64 (frint VR256X:$src)), (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>; def : Pat<(v4f64 (ftrunc VR256X:$src)), Index: llvm/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/lib/Target/X86/X86InstrSSE.td +++ llvm/lib/Target/X86/X86InstrSSE.td @@ -5731,6 +5731,15 @@ (ROUNDPDr VR128:$src, (i32 0xB))>; } +defm : scalar_unary_math_imm_patterns; +defm : scalar_unary_math_imm_patterns; +defm : scalar_unary_math_imm_patterns; +defm : scalar_unary_math_imm_patterns; + //===----------------------------------------------------------------------===// // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// Index: llvm/test/CodeGen/X86/vec_floor.ll =================================================================== --- llvm/test/CodeGen/X86/vec_floor.ll +++ llvm/test/CodeGen/X86/vec_floor.ll @@ -770,3 +770,425 @@ %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> ) ret <4 x float> %t } + +define <4 x float> 
@ceil_ss(<4 x float> %x, <4 x float> %y) nounwind { +; SSE41-LABEL: ceil_ss: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundss $2, %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_ss: +; AVX: ## %bb.0: +; AVX-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: vroundss $2, %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = extractelement <4 x float> %x, i32 0 + %call = call float @llvm.ceil.f32(float %s) + %res = insertelement <4 x float> %y, float %call, i32 0 + ret <4 x float> %res +} +declare float @llvm.ceil.f32(float %s) + +define <2 x double> @ceil_sd(<2 x double> %x, <2 x double> %y) nounwind { +; SSE41-LABEL: ceil_sd: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundsd $2, %xmm0, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_sd: +; AVX: ## %bb.0: +; AVX-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_sd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vroundsd $2, %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retq + %s = extractelement <2 x double> %x, i32 0 + %call = call double @llvm.ceil.f64(double %s) + %res = insertelement <2 x double> %y, double %call, i32 0 + ret <2 x double> %res +} +declare double @llvm.ceil.f64(double %s) + +define <4 x float> @ceil_mask_128_ps(<4 x float> %x, <4 x float> %y) nounwind { +; SSE41-LABEL: ceil_mask_128_ps: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundps $10, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_128_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vroundps $10, %xmm0, %xmm0 +; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_128_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaleps $10, %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovaps 
%xmm1, %xmm0 +; AVX512-NEXT: retq + %k = fcmp oeq <4 x float> %x, %y + %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) + %res = select <4 x i1> %k, <4 x float> %call, <4 x float> %y + ret <4 x float> %res +} + +define <2 x double> @ceil_mask_128_pd(<2 x double> %x, <2 x double> %y) nounwind { +; SSE41-LABEL: ceil_mask_128_pd: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundpd $10, %xmm0, %xmm2 +; SSE41-NEXT: cmpeqpd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_128_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vroundpd $10, %xmm0, %xmm0 +; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_128_pd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscalepd $10, %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovapd %xmm1, %xmm0 +; AVX512-NEXT: retq + %k = fcmp oeq <2 x double> %x, %y + %call = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) + %res = select <2 x i1> %k, <2 x double> %call, <2 x double> %y + ret <2 x double> %res +} + +define <8 x float> @ceil_mask_256_ps(<8 x float> %x, <8 x float> %y) nounwind { +; SSE41-LABEL: ceil_mask_256_ps: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundps $10, %xmm1, %xmm4 +; SSE41-NEXT: cmpeqps %xmm3, %xmm1 +; SSE41-NEXT: roundps $10, %xmm0, %xmm5 +; SSE41-NEXT: cmpeqps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: movaps %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_256_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqps %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_256_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqps %ymm1, %ymm0, %k1 +; AVX512-NEXT: vrndscaleps $10, %ymm0, %ymm1 {%k1} 
+; AVX512-NEXT: vmovaps %ymm1, %ymm0 +; AVX512-NEXT: retq + %k = fcmp oeq <8 x float> %x, %y + %call = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) + %res = select <8 x i1> %k, <8 x float> %call, <8 x float> %y + ret <8 x float> %res +} + +define <4 x double> @ceil_mask_256_pd(<4 x double> %x, <4 x double> %y) nounwind { +; SSE41-LABEL: ceil_mask_256_pd: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundpd $10, %xmm1, %xmm4 +; SSE41-NEXT: cmpeqpd %xmm3, %xmm1 +; SSE41-NEXT: roundpd $10, %xmm0, %xmm5 +; SSE41-NEXT: cmpeqpd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: movapd %xmm3, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_256_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_256_pd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqpd %ymm1, %ymm0, %k1 +; AVX512-NEXT: vrndscalepd $10, %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovapd %ymm1, %ymm0 +; AVX512-NEXT: retq + %k = fcmp oeq <4 x double> %x, %y + %call = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) + %res = select <4 x i1> %k, <4 x double> %call, <4 x double> %y + ret <4 x double> %res +} + +define <16 x float> @ceil_mask_512_ps(<16 x float> %x, <16 x float> %y) nounwind { +; SSE41-LABEL: ceil_mask_512_ps: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundps $10, %xmm3, %xmm8 +; SSE41-NEXT: cmpeqps %xmm7, %xmm3 +; SSE41-NEXT: roundps $10, %xmm2, %xmm9 +; SSE41-NEXT: cmpeqps %xmm6, %xmm2 +; SSE41-NEXT: roundps $10, %xmm1, %xmm10 +; SSE41-NEXT: cmpeqps %xmm5, %xmm1 +; SSE41-NEXT: roundps $10, %xmm0, %xmm11 +; SSE41-NEXT: cmpeqps %xmm4, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: blendvps %xmm0, 
%xmm9, %xmm6 +; SSE41-NEXT: movaps %xmm3, %xmm0 +; SSE41-NEXT: blendvps %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movaps %xmm4, %xmm0 +; SSE41-NEXT: movaps %xmm5, %xmm1 +; SSE41-NEXT: movaps %xmm6, %xmm2 +; SSE41-NEXT: movaps %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_512_ps: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqps %ymm3, %ymm1, %ymm4 +; AVX-NEXT: vcmpeqps %ymm2, %ymm0, %ymm5 +; AVX-NEXT: vroundps $10, %ymm1, %ymm1 +; AVX-NEXT: vroundps $10, %ymm0, %ymm0 +; AVX-NEXT: vblendvps %ymm5, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vblendvps %ymm4, %ymm1, %ymm3, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_512_ps: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqps %zmm1, %zmm0, %k1 +; AVX512-NEXT: vrndscaleps $10, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovaps %zmm1, %zmm0 +; AVX512-NEXT: retq + %k = fcmp oeq <16 x float> %x, %y + %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) + %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y + ret <16 x float> %res +} + +define <8 x double> @ceil_mask_512_pd(<8 x double> %x, <8 x double> %y) nounwind { +; SSE41-LABEL: ceil_mask_512_pd: +; SSE41: ## %bb.0: +; SSE41-NEXT: roundpd $10, %xmm3, %xmm8 +; SSE41-NEXT: cmpeqpd %xmm7, %xmm3 +; SSE41-NEXT: roundpd $10, %xmm2, %xmm9 +; SSE41-NEXT: cmpeqpd %xmm6, %xmm2 +; SSE41-NEXT: roundpd $10, %xmm1, %xmm10 +; SSE41-NEXT: cmpeqpd %xmm5, %xmm1 +; SSE41-NEXT: roundpd $10, %xmm0, %xmm11 +; SSE41-NEXT: cmpeqpd %xmm4, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 +; SSE41-NEXT: movapd %xmm2, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 +; SSE41-NEXT: movapd %xmm3, %xmm0 +; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 +; SSE41-NEXT: movapd %xmm4, %xmm0 +; SSE41-NEXT: movapd %xmm5, %xmm1 +; SSE41-NEXT: movapd %xmm6, %xmm2 +; SSE41-NEXT: movapd %xmm7, %xmm3 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_512_pd: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqpd %ymm3, %ymm1, %ymm4 +; AVX-NEXT: 
vcmpeqpd %ymm2, %ymm0, %ymm5 +; AVX-NEXT: vroundpd $10, %ymm1, %ymm1 +; AVX-NEXT: vroundpd $10, %ymm0, %ymm0 +; AVX-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; AVX-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_512_pd: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqpd %zmm1, %zmm0, %k1 +; AVX512-NEXT: vrndscalepd $10, %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovapd %zmm1, %zmm0 +; AVX512-NEXT: retq + %k = fcmp oeq <8 x double> %x, %y + %call = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) + %res = select <8 x i1> %k, <8 x double> %call, <8 x double> %y + ret <8 x double> %res +} + +define <4 x float> @ceil_mask_ss(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind { +; SSE41-LABEL: ceil_mask_ss: +; SSE41: ## %bb.0: +; SSE41-NEXT: testb $1, %dil +; SSE41-NEXT: je LBB44_2 +; SSE41-NEXT: ## %bb.1: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: roundss $10, %xmm0, %xmm2 +; SSE41-NEXT: LBB44_2: +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_ss: +; AVX: ## %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: je LBB44_2 +; AVX-NEXT: ## %bb.1: +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: LBB44_2: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3] +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_ss: +; AVX512: ## %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq + %xk = zext i16 %k to i32 + %mask = and i32 %xk, 1 + %nmask = icmp eq i32 %mask, 0 + %s = extractelement <4 x float> %x, i64 0 + %call = tail call float @llvm.ceil.f32(float %s) + %dst = extractelement <4 x float> %w, i64 0 + %low = select i1 %nmask, float %dst, float %call + %res = insertelement <4 x float> %y, float %low, i64 0 + ret <4 x float> %res +} + +define <2 x double> @ceil_mask_sd(<2 x double> %x, <2 x double> %y, <2 x double> %w, i16 
%k) nounwind { +; SSE41-LABEL: ceil_mask_sd: +; SSE41: ## %bb.0: +; SSE41-NEXT: testb $1, %dil +; SSE41-NEXT: je LBB45_2 +; SSE41-NEXT: ## %bb.1: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: roundsd $10, %xmm0, %xmm2 +; SSE41-NEXT: LBB45_2: +; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1] +; SSE41-NEXT: movapd %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_sd: +; AVX: ## %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: je LBB45_2 +; AVX-NEXT: ## %bb.1: +; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: LBB45_2: +; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm1[1] +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_sd: +; AVX512: ## %bb.0: +; AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vrndscalesd $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovapd %xmm2, %xmm0 +; AVX512-NEXT: retq + %xk = zext i16 %k to i32 + %mask = and i32 %xk, 1 + %nmask = icmp eq i32 %mask, 0 + %s = extractelement <2 x double> %x, i64 0 + %call = tail call double @llvm.ceil.f64(double %s) + %dst = extractelement <2 x double> %w, i64 0 + %low = select i1 %nmask, double %dst, double %call + %res = insertelement <2 x double> %y, double %low, i64 0 + ret <2 x double> %res +} + +define <4 x float> @ceil_mask_ss_trunc(<4 x float> %x, <4 x float> %y, <4 x float> %w, i16 %k) nounwind { +; SSE41-LABEL: ceil_mask_ss_trunc: +; SSE41: ## %bb.0: +; SSE41-NEXT: testb $1, %dil +; SSE41-NEXT: je LBB46_2 +; SSE41-NEXT: ## %bb.1: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: roundss $10, %xmm0, %xmm2 +; SSE41-NEXT: LBB46_2: +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_ss_trunc: +; AVX: ## %bb.0: +; AVX-NEXT: testb $1, %dil +; AVX-NEXT: je LBB46_2 +; AVX-NEXT: ## %bb.1: +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: LBB46_2: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3] +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_ss_trunc: +; AVX512: ## %bb.0: +; 
AVX512-NEXT: kmovw %edi, %k1 +; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq + %mask = trunc i16 %k to i1 + %s = extractelement <4 x float> %x, i64 0 + %call = tail call float @llvm.ceil.f32(float %s) + %dst = extractelement <4 x float> %w, i64 0 + %low = select i1 %mask, float %call, float %dst + %res = insertelement <4 x float> %y, float %low, i64 0 + ret <4 x float> %res +} + +define <4 x float> @ceil_mask_ss_mask8(<4 x float> %x, <4 x float> %y, <4 x float> %w) nounwind { +; SSE41-LABEL: ceil_mask_ss_mask8: +; SSE41: ## %bb.0: +; SSE41-NEXT: movaps %xmm0, %xmm3 +; SSE41-NEXT: cmpeqps %xmm1, %xmm3 +; SSE41-NEXT: pextrb $0, %xmm3, %eax +; SSE41-NEXT: testb $1, %al +; SSE41-NEXT: je LBB47_2 +; SSE41-NEXT: ## %bb.1: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: roundss $10, %xmm0, %xmm2 +; SSE41-NEXT: LBB47_2: +; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] +; SSE41-NEXT: movaps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: ceil_mask_ss_mask8: +; AVX: ## %bb.0: +; AVX-NEXT: vcmpeqps %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vpextrb $0, %xmm3, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: je LBB47_2 +; AVX-NEXT: ## %bb.1: +; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm2 +; AVX-NEXT: LBB47_2: +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3] +; AVX-NEXT: retq +; +; AVX512-LABEL: ceil_mask_ss_mask8: +; AVX512: ## %bb.0: +; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k1 +; AVX512-NEXT: vrndscaless $2, %xmm0, %xmm1, %xmm2 {%k1} +; AVX512-NEXT: vmovaps %xmm2, %xmm0 +; AVX512-NEXT: retq + %mask1 = fcmp oeq <4 x float> %x, %y + %mask = extractelement <4 x i1> %mask1, i64 0 + %s = extractelement <4 x float> %x, i64 0 + %call = tail call float @llvm.ceil.f32(float %s) + %dst = extractelement <4 x float> %w, i64 0 + %low = select i1 %mask, float %call, float %dst + %res = insertelement <4 x float> %y, float %low, i64 0 + ret <4 x float> %res +}