Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -5398,9 +5398,21 @@ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rsqrt14_pd_128 : GCCBuiltin<"__builtin_ia32_rsqrt14pd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rsqrt14_pd_256 : GCCBuiltin<"__builtin_ia32_rsqrt14pd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rsqrt14_ps_128 : GCCBuiltin<"__builtin_ia32_rsqrt14ps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rsqrt14_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrt14ps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; @@ -5411,9 +5423,21 @@ Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rcp14_pd_128 : GCCBuiltin<"__builtin_ia32_rcp14pd128_mask">, + Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rcp14_pd_256 : GCCBuiltin<"__builtin_ia32_rcp14pd256_mask">, + Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, + llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rcp14_ps_128 : GCCBuiltin<"__builtin_ia32_rcp14ps128_mask">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_rcp14_ps_256 : GCCBuiltin<"__builtin_ia32_rcp14ps256_mask">, + Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, + llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -5712,20 +5712,6 @@ defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; -def : Pat <(v16f32 (int_x86_avx512_rsqrt14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRSQRT14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rsqrt14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRSQRT14PDZr VR512:$src)>; - -def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src), - (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))), - (VRCP14PSZr VR512:$src)>; -def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src), - (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), - (VRCP14PDZr VR512:$src)>; - /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode> { Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -3446,8 +3446,8 @@ /// sse1_fp_unop_p - SSE1 unops in packed form. multiclass sse1_fp_unop_p opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { -let Predicates = [HasAVX] in { + OpndItins itins, list prds> { +let Predicates = prds in { def V#NAME#PSr : PSI, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>; + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>; + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; // There is no f64 version of the reciprocal approximation instructions. Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -1624,12 +1624,24 @@ X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0), X86_INTRINSIC_DATA(avx512_psll_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSHLDQ, 0), X86_INTRINSIC_DATA(avx512_psrl_dq_512, INTR_TYPE_2OP_IMM8, X86ISD::VSRLDQ, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), - X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -5530,3 +5530,184 @@ %res4 = fadd <4 x double> %res2, %res3 ret <4 x double> %res4 } + +define <8 x float> @test_rsqrt_ps_256_rr(<8 x float> %a0) { +; CHECK-LABEL: test_rsqrt_ps_256_rr: +; CHECK: vrsqrt14ps %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrkz(<8 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_256_rrkz: +; CHECK: vrsqrt14ps %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rsqrt_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_256_rrk: +; CHECK: vrsqrt14ps %ymm0, %ymm1 {%k1} + %res = call <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rr(<4 x float> %a0) { +; CHECK-LABEL: test_rsqrt_ps_128_rr: +; CHECK: vrsqrt14ps %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrkz(<4 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_128_rrkz: +; CHECK: vrsqrt14ps %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rsqrt_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_ps_128_rrk: +; CHECK: vrsqrt14ps %xmm0, %xmm1 {%k1} + %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rsqrt14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rsqrt14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @test_rcp_ps_256_rr(<8 x float> %a0) { +; CHECK-LABEL: test_rcp_ps_256_rr: +; CHECK: vrcp14ps %ymm0, %ymm0 + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrkz(<8 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_256_rrkz: +; CHECK: vrcp14ps %ymm0, %ymm0 {%k1} {z} + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_rcp_ps_256_rrk(<8 x float> %a0, <8 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_256_rrk: +; CHECK: vrcp14ps %ymm0, %ymm1 {%k1} + %res = call <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rr(<4 x float> %a0) { +; CHECK-LABEL: test_rcp_ps_128_rr: +; CHECK: vrcp14ps %xmm0, %xmm0 + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrkz(<4 x float> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_128_rrkz: +; CHECK: vrcp14ps %xmm0, %xmm0 {%k1} {z} + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_rcp_ps_128_rrk(<4 x float> %a0, <4 x float> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_ps_128_rrk: +; CHECK: vrcp14ps %xmm0, %xmm1 {%k1} + %res = call <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.rcp14.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.avx512.rcp14.ps.128(<4 x float>, <4 x float>, i8) nounwind readnone + + +define <4 x double> @test_rsqrt_pd_256_rr(<4 x double> %a0) { +; CHECK-LABEL: test_rsqrt_pd_256_rr: +; CHECK: vrsqrt14pd %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrkz(<4 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_256_rrkz: +; CHECK: vrsqrt14pd %ymm0, %ymm0 {%k1} {z} + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_256_rrk: +; CHECK: vrsqrt14pd %ymm0, %ymm1 {%k1} + %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rr(<2 x double> %a0) { +; CHECK-LABEL: test_rsqrt_pd_128_rr: +; CHECK: vrsqrt14pd %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrkz(<2 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_128_rrkz: +; CHECK: vrsqrt14pd %xmm0, %xmm0 {%k1} {z} + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rsqrt_pd_128_rrk: +; CHECK: vrsqrt14pd %xmm0, %xmm1 {%k1} + %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_rcp_pd_256_rr(<4 x double> %a0) { +; CHECK-LABEL: test_rcp_pd_256_rr: +; CHECK: vrcp14pd %ymm0, %ymm0 + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrkz(<4 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_256_rrkz: +; CHECK: vrcp14pd %ymm0, %ymm0 {%k1} {z} + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_256_rrk: +; CHECK: vrcp14pd %ymm0, %ymm1 {%k1} + %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rr(<2 x double> %a0) { +; CHECK-LABEL: test_rcp_pd_128_rr: +; CHECK: vrcp14pd %xmm0, %xmm0 + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrkz(<2 x double> %a0, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_128_rrkz: +; CHECK: vrcp14pd %xmm0, %xmm0 {%k1} {z} + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> zeroinitializer, i8 %mask) + ret <2 x double> %res +} + +define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8 %mask) { +; CHECK-LABEL: test_rcp_pd_128_rrk: +; CHECK: vrcp14pd %xmm0, %xmm1 {%k1} + %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask) + ret <2 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i8) nounwind readnone