Index: llvm/include/llvm/IR/IntrinsicsX86.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsX86.td
+++ llvm/include/llvm/IR/IntrinsicsX86.td
@@ -706,16 +706,16 @@
 // FP rounding ops
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">,
+  def int_x86_sse41_round_ss :
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
                 [IntrNoMem]>;
-  def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">,
+  def int_x86_sse41_round_ps :
      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">,
+  def int_x86_sse41_round_sd :
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
                 [IntrNoMem]>;
-  def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">,
+  def int_x86_sse41_round_pd :
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem]>;
 }
@@ -999,10 +999,10 @@
   def int_x86_avx_rcp_ps_256 : GCCBuiltin<"__builtin_ia32_rcpps256">,
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">,
+  def int_x86_avx_round_pd_256 :
       Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">,
+  def int_x86_avx_round_ps_256 :
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem]>;
 }
@@ -4227,22 +4227,22 @@
                 [llvm_v8i64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">,
+  def int_x86_avx512_mask_rndscale_pd_128 :
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">,
+  def int_x86_avx512_mask_rndscale_pd_256 :
       Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">,
+  def int_x86_avx512_mask_rndscale_pd_512 :
       Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">,
+  def int_x86_avx512_mask_rndscale_ps_128 :
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">,
+  def int_x86_avx512_mask_rndscale_ps_256 :
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty],
                 [IntrNoMem]>;
-  def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">,
+  def int_x86_avx512_mask_rndscale_ps_512 :
       Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty],
                 [IntrNoMem]>;
 
   def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">,
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30480,6 +30480,37 @@
   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
 }
 
+// For SSE4.1 and AVX512, we may want to combine VRNDSCALES from
+// vector_shuffle<{0,3}|{0,5,6,7}> (fceil|ffloor A), B
+// patterns.
+static SDValue combineShuffleFloorCeil(SDNode *N, SelectionDAG &DAG,
+                                       const X86Subtarget &Subtarget) {
+  if (!isa<ShuffleVectorSDNode>(N))
+    return SDValue();
+  EVT VT = N->getValueType(0);
+  unsigned Num = VT.getVectorNumElements();
+  if (Num * VT.getScalarSizeInBits() != 128 || !Subtarget.hasSSE41())
+    return SDValue();
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  int Op = N0.getOpcode();
+  if ((Num != 2 && Num != 4) || (Op != ISD::FCEIL && Op != ISD::FFLOOR))
+    return SDValue();
+
+  // The mask being matched here is equivalent to a 0...01 select mask.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  if (SVOp->getMaskElt(0) != 0)
+    return SDValue();
+  for (unsigned i = 1; i < Num; ++i)
+    if (SVOp->getMaskElt(i) != Num + i)
+      return SDValue();
+
+  int Imm = (Op == ISD::FCEIL) ? 2 : 1;
+  SDLoc DL(N);
+  return DAG.getNode(X86ISD::VRNDSCALES, DL, VT, N1, N0.getOperand(0),
+                     DAG.getConstant(Imm, DL, MVT::i32));
+}
+
 // We are looking for a shuffle where both sources are concatenated with undef
 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
 // if we can express this as a single-source shuffle, that's preferable.
@@ -30653,6 +30684,9 @@
       EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
     return LD;
 
+  if (SDValue RndScale = combineShuffleFloorCeil(N, DAG, Subtarget))
+    return RndScale;
+
   // For AVX2, we sometimes want to combine
   // (vector_shuffle <mask> (concat_vectors t1, undef)
   //                        (concat_vectors t2, undef))
Index: llvm/lib/Target/X86/X86InstrAVX512.td
===================================================================
--- llvm/lib/Target/X86/X86InstrAVX512.td
+++ llvm/lib/Target/X86/X86InstrAVX512.td
@@ -9370,10 +9370,14 @@
 let Predicates = [HasAVX512] in {
 def : Pat<(v16f32 (ffloor VR512:$src)),
           (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
+def : Pat<(v16f32 (vselect v16f32_info.KRCWM:$mask, (ffloor VR512:$src), VR512:$dst)),
+          (VRNDSCALEPSZrrik VR512:$dst, v16f32_info.KRCWM:$mask, VR512:$src, (i32 0x9))>;
 def : Pat<(v16f32 (fnearbyint VR512:$src)),
           (VRNDSCALEPSZrri VR512:$src, (i32 0xC))>;
 def : Pat<(v16f32 (fceil VR512:$src)),
           (VRNDSCALEPSZrri VR512:$src, (i32 0xA))>;
+def : Pat<(v16f32 (vselect v16f32_info.KRCWM:$mask, (fceil VR512:$src), VR512:$dst)),
+          (VRNDSCALEPSZrrik VR512:$dst, v16f32_info.KRCWM:$mask, VR512:$src, (i32 0xA))>;
 def : Pat<(v16f32 (frint VR512:$src)),
           (VRNDSCALEPSZrri VR512:$src, (i32 0x4))>;
 def : Pat<(v16f32 (ftrunc VR512:$src)),
@@ -9381,10 +9385,14 @@
 
 def : Pat<(v8f64 (ffloor VR512:$src)),
           (VRNDSCALEPDZrri VR512:$src, (i32 0x9))>;
+def : Pat<(v8f64 (vselect v8f64_info.KRCWM:$mask, (ffloor VR512:$src), VR512:$dst)),
+          (VRNDSCALEPDZrrik VR512:$dst, v8f64_info.KRCWM:$mask, VR512:$src, (i32 0x9))>;
 def : Pat<(v8f64 (fnearbyint VR512:$src)),
           (VRNDSCALEPDZrri VR512:$src, (i32 0xC))>;
 def : Pat<(v8f64 (fceil VR512:$src)),
           (VRNDSCALEPDZrri VR512:$src, (i32 0xA))>;
+def : Pat<(v8f64 (vselect v8f64_info.KRCWM:$mask, (fceil VR512:$src), VR512:$dst)),
+          (VRNDSCALEPDZrrik VR512:$dst, v8f64_info.KRCWM:$mask, VR512:$src, (i32 0xA))>;
 def : Pat<(v8f64 (frint VR512:$src)),
           (VRNDSCALEPDZrri VR512:$src, (i32 0x4))>;
 def : Pat<(v8f64 (ftrunc VR512:$src)),
Index: llvm/test/CodeGen/X86/vec_floor.ll
===================================================================
--- llvm/test/CodeGen/X86/vec_floor.ll
+++ llvm/test/CodeGen/X86/vec_floor.ll
@@ -770,3 +770,107 @@
   %t = call <4 x float> @llvm.trunc.v4f32(<4 x float> )
   ret <4 x float> %t
 }
+
+define <4 x float> @ceil_ss_select(<4 x float> %x, <4 x float> %y) nounwind {
+; SSE41-LABEL: ceil_ss_select:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    roundss $2, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: ceil_ss_select:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vroundss $2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: ceil_ss_select:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vroundss $2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
+  %res = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %call, <4 x float> %y
+  ret <4 x float> %res
+}
+
+define <4 x float> @ceil_ss_shuffle(<4 x float> %x, <4 x float> %y) nounwind {
+; SSE41-LABEL: ceil_ss_shuffle:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    roundss $2, %xmm0, %xmm1
+; SSE41-NEXT:    movaps %xmm1, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: ceil_ss_shuffle:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vroundss $2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: ceil_ss_shuffle:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vroundss $2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %call = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
+  %res = shufflevector <4 x float> %call, <4 x float> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+  ret <4 x float> %res
+}
+
+define <16 x float> @ceil_mask(<16 x float> %x, <16 x float> %y, <16 x i1> %k) nounwind {
+; SSE41-LABEL: ceil_mask:
+; SSE41:       ## %bb.0:
+; SSE41-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE41-NEXT:    roundps $10, %xmm3, %xmm9
+; SSE41-NEXT:    roundps $10, %xmm2, %xmm2
+; SSE41-NEXT:    roundps $10, %xmm1, %xmm1
+; SSE41-NEXT:    roundps $10, %xmm0, %xmm3
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm6
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pslld $31, %xmm0
+; SSE41-NEXT:    blendvps %xmm0, %xmm9, %xmm7
+; SSE41-NEXT:    movaps %xmm4, %xmm0
+; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    movaps %xmm6, %xmm2
+; SSE41-NEXT:    movaps %xmm7, %xmm3
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: ceil_mask:
+; AVX:       ## %bb.0:
+; AVX-NEXT:    vroundps $10, %ymm1, %ymm1
+; AVX-NEXT:    vroundps $10, %ymm0, %ymm0
+; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX-NEXT:    vpslld $31, %xmm6, %xmm6
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; AVX-NEXT:    vpslld $31, %xmm5, %xmm5
+; AVX-NEXT:    vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX-NEXT:    vblendvps %ymm5, %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX-NEXT:    vpslld $31, %xmm4, %xmm4
+; AVX-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX-NEXT:    retq
+;
+; AVX512-LABEL: ceil_mask:
+; AVX512:       ## %bb.0:
+; AVX512-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512-NEXT:    vrndscaleps $10, %zmm0, %zmm1 {%k1}
+; AVX512-NEXT:    vmovaps %zmm1, %zmm0
+; AVX512-NEXT:    retq
+  %call = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x)
+  %res = select <16 x i1> %k, <16 x float> %call, <16 x float> %y
+  ret <16 x float> %res
+}
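
Illustrative sketch (not part of the patch): combineShuffleFloorCeil rewrites a shufflevector that takes lane 0 from an fceil/ffloor result and the remaining lanes from a second vector into a single X86ISD::VRNDSCALES node, using immediate 2 for ceil and 1 for floor. The patch only adds scalar tests for the ceil case; the IR below shows the analogous floor case. The function name floor_ss_shuffle is hypothetical, and the expected output assumes the same SSE4.1 llc run line already used by vec_floor.ll.

define <4 x float> @floor_ss_shuffle(<4 x float> %x, <4 x float> %y) nounwind {
  ; Floor lane 0 of %x and keep lanes 1-3 of %y. With this combine, the shuffle of the
  ; ffloor result should select to a single "roundss $1" (round toward -inf) on SSE4.1.
  %call = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %res = shufflevector <4 x float> %call, <4 x float> %y, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}
declare <4 x float> @llvm.floor.v4f32(<4 x float>)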