Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -3050,6 +3050,13 @@
   def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">,
         Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                    llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">,
+        Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
+                   llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">,
+        Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
+                   llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_mask">,
         Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                    llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty],
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -419,6 +419,9 @@
       // Test if in transactional execution.
       XTEST,
 
+      // ERI instructions
+      RSQRT28, RCP28, EXP2,
+
       // Compare and swap.
       LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
       LCMPXCHG8_DAG,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -16278,7 +16278,9 @@
 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
 /// necessary casting for \p Mask when lowering masking intrinsics.
 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
-                                    SDValue PreservedSrc, SelectionDAG &DAG) {
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
     EVT VT = Op.getValueType();
     EVT MaskVT = EVT::getVectorVT(*DAG.getContext(),
                                   MVT::i1, VT.getVectorNumElements());
@@ -16305,7 +16307,8 @@
       case X86ISD::CMPMU:
         return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
     }
-
+  if (PreservedSrc.getOpcode() == ISD::UNDEF)
+    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
 }
 
@@ -16357,10 +16360,11 @@
   }
 }
 
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                       SelectionDAG &DAG) {
   SDLoc dl(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-
+  EVT VT = Op.getValueType();
   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
   if (IntrData) {
     switch(IntrData->Type) {
@@ -16372,6 +16376,16 @@
     case INTR_TYPE_3OP:
       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
         Op.getOperand(2), Op.getOperand(3));
+    case INTR_TYPE_1OP_MASK_RM: {
+      SDValue Src = Op.getOperand(1);
+      SDValue Src0 = Op.getOperand(2);
+      SDValue Mask = Op.getOperand(3);
+      SDValue RoundingMode = Op.getOperand(4);
+      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
+                                              RoundingMode),
+                                  Mask, Src0, Subtarget, DAG);
+    }
+
     case CMP_MASK:
     case CMP_MASK_CC: {
       // Comparison intrinsics with masks.
@@ -16399,7 +16413,8 @@
                     Op.getOperand(2));
       }
       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
-                                        DAG.getTargetConstant(0, MaskVT), DAG);
+                                        DAG.getTargetConstant(0, MaskVT),
+                                        Subtarget, DAG);
       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                 DAG.getUNDEF(BitcastVT), CmpMask,
                                 DAG.getIntPtrConstant(0));
@@ -16566,7 +16581,8 @@
                                               Op.getValueType(), Op.getOperand(2),
                                               Op.getOperand(1),
                                               Op.getOperand(3)),
-                                  Op.getOperand(5), Op.getOperand(4), DAG);
+                                  Op.getOperand(5), Op.getOperand(4),
+                                  Subtarget, DAG);
 
   // ptest and testp intrinsics. The intrinsic these come from are designed to
   // return an integer value, not just an instruction so lower it to the ptest
@@ -16740,7 +16756,8 @@
                                               Op.getOperand(1),
                                               Op.getOperand(2),
                                               Op.getOperand(3)),
-                                  Op.getOperand(4), Op.getOperand(1), DAG);
+                                  Op.getOperand(4), Op.getOperand(1),
+                                  Subtarget, DAG);
     else
       return SDValue();
   }
@@ -18855,7 +18872,7 @@
   case ISD::VASTART:            return LowerVASTART(Op, DAG);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -146,20 +146,21 @@
                                   list<dag> Pattern,
                                   list<dag> MaskingPattern,
                                   list<dag> ZeroMaskingPattern,
+                                  string Round = "",
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
                                   bit IsCommutable = 0> {
   let isCommutable = IsCommutable in
     def NAME: AVX512;
 
   // Prefer over VMOV*rrk Pat<>
   let AddedComplexity = 20 in
     def NAME#k: AVX512, EVEX_K {
       // In case of the 3src subclass this is overridden with a let.
@@ -167,8 +168,8 @@
   }
   let AddedComplexity = 30 in // Prefer over VMOV*rrkz Pat<>
     def NAME#kz: AVX512, EVEX_KZ;
@@ -182,6 +183,7 @@
                                   string OpcodeStr,
                                   string AttSrcAsm, string IntelSrcAsm,
                                   dag RHS, dag MaskingRHS,
+                                  string Round = "",
                                   string MaskingConstraint = "",
                                   InstrItinClass itin = NoItinerary,
                                   bit IsCommutable = 0> :
@@ -191,7 +193,7 @@
                          [(set _.RC:$dst, MaskingRHS)],
                          [(set _.RC:$dst, (vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
-                         MaskingConstraint, NoItinerary, IsCommutable>;
+                         Round, MaskingConstraint, NoItinerary, IsCommutable>;
 
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the instruction. In the masking case, the
@@ -199,13 +201,14 @@
 multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
                            dag Outs, dag Ins, string OpcodeStr,
                            string AttSrcAsm, string IntelSrcAsm,
-                           dag RHS, InstrItinClass itin = NoItinerary,
+                           dag RHS, string Round = "",
+                           InstrItinClass itin = NoItinerary,
                            bit IsCommutable = 0> :
    AVX512_maskable_common;
 
 // Similar to AVX512_maskable but in this case one of the source operands
@@ -232,7 +235,7 @@
                        AVX512_maskable_custom;
 
 // Bitcasts between 512-bit vector types. Return the original type since
@@ -2626,7 +2629,7 @@
                     (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
                     "$src2, $src1", "$src1, $src2",
                     (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
-                    itins.rr, IsCommutable>,
+                    "", itins.rr, IsCommutable>,
                     AVX512BIBase, EVEX_4V;
 
   let mayLoad = 1 in
@@ -2635,7 +2638,7 @@
                     "$src2, $src1", "$src1, $src2",
                     (_.VT (OpNode _.RC:$src1,
                                   (bitconvert (_.LdFrag addr:$src2)))),
-                    itins.rm>,
+                    "", itins.rm>,
                     AVX512BIBase, EVEX_4V;
 }
@@ -2651,7 +2654,7 @@
                     (_.VT (OpNode _.RC:$src1,
                                   (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
-                    itins.rm>,
+                    "", itins.rm>,
                     AVX512BIBase, EVEX_4V, EVEX_B;
 }
@@ -4199,44 +4202,44 @@
                       (COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;
 
 /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
-multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr,
-                         RegisterClass RC, X86MemOperand x86memop> {
-  let hasSideEffects = 0, Predicates = [HasERI] in {
-  def r : AVX5128I, EVEX;
-  def rb : AVX5128I, EVEX, EVEX_B;
-  def m : AVX5128I, EVEX;
-  }
-}
-defm VRSQRT28PSZ : avx512_fp28_p<0xCC, "vrsqrt28ps", VR512, f512mem>,
-                        EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRSQRT28PDZ : avx512_fp28_p<0xCC, "vrsqrt28pd", VR512, f512mem>,
-                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-defm VRCP28PSZ : avx512_fp28_p<0xCA, "vrcp28ps", VR512, f512mem>,
-                        EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VRCP28PDZ : avx512_fp28_p<0xCA, "vrcp28pd", VR512, f512mem>,
-                        VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
-def : Pat <(v16f32 (int_x86_avx512_rsqrt28_ps (v16f32 VR512:$src),
-                   (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
-           (VRSQRT28PSZrb VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rsqrt28_pd (v8f64 VR512:$src),
-                  (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
-           (VRSQRT28PDZrb VR512:$src)>;
-
-def : Pat <(v16f32 (int_x86_avx512_rcp28_ps (v16f32 VR512:$src),
-                   (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), FROUND_NO_EXC)),
-           (VRCP28PSZrb VR512:$src)>;
-def : Pat <(v8f64 (int_x86_avx512_rcp28_pd (v8f64 VR512:$src),
-                  (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1), FROUND_NO_EXC)),
-           (VRCP28PDZrb VR512:$src)>;
+
+multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
+                         SDNode OpNode> {
+
+  defm r : AVX512_maskable;
+
+  defm rb : AVX512_maskable, EVEX_B;
+
+  defm m : AVX512_maskable;
+
+  defm mb : AVX512_maskable, EVEX_B;
+}
+
+multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm PS : avx512_fp28_p,
+                      EVEX_CD8<32, CD8VF>;
+  defm PD : avx512_fp28_p,
+                      VEX_W, EVEX_CD8<32, CD8VF>;
+}
+
+let Predicates = [HasERI], hasSideEffects = 0 in {
+
+  defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX, EVEX_V512, T8PD;
+  defm VRCP28   : avx512_eri<0xCA, "vrcp28",   X86rcp28>,   EVEX, EVEX_V512, T8PD;
+  defm VEXP2    : avx512_eri<0xC8, "vexp2",    X86exp2>,    EVEX, EVEX_V512, T8PD;
+}
 
 multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86VectorVTInfo _>{
Index: lib/Target/X86/X86InstrFragmentsSIMD.td
===================================================================
--- lib/Target/X86/X86InstrFragmentsSIMD.td
+++ lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -203,6 +203,8 @@
 def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
+                           SDTCisVec<0>, SDTCisInt<2>]>;
 
 def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
 def X86VAlign  : SDNode<"X86ISD::VALIGN",  SDTShuff3OpI>;
@@ -261,6 +263,10 @@
 def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
 def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
 
+def X86rsqrt28  : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
+def X86rcp28    : SDNode<"X86ISD::RCP28",   STDFp1SrcRm>;
+def X86exp2     : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;
+
 def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                          SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
                                          SDTCisVT<4, i8>]>;
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -20,7 +20,7 @@
   INTR_NO_TYPE, GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
-  CMP_MASK, CMP_MASK_CC, VSHIFT, COMI
+  CMP_MASK, CMP_MASK_CC, VSHIFT, COMI, INTR_TYPE_1OP_MASK_RM
 };
 
 struct IntrinsicData {
@@ -156,6 +156,8 @@
   X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::EXP2, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
   X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -204,6 +206,10 @@
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_pd,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rcp28_ps,   INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
+  X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -60,20 +60,6 @@
 }
 declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
 
-define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
-define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-  ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
 declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
 
 define <8 x double> @test7(<8 x double> %a) {
@@ -97,13 +83,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
 
-define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
-
 define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
   ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -111,13 +90,6 @@
 }
 declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
   ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
   %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
@@ -125,13 +97,6 @@
 }
 declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
 
-define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
 define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
   ; CHECK: vsqrtpd
   %res = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) ; <<8 x double>> [#uses=1]
Index: test/CodeGen/X86/avx512er-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512er-intrinsics.ll
+++ test/CodeGen/X86/avx512er-intrinsics.ll
@@ -0,0 +1,79 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
+
+define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm1 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: kmovw
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
+  ret <16 x float> %res
+}
+
+define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
+  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
+  ret <16 x float> %res
+}
+
+
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
+  ; CHECK: vrcp28ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
+  ; CHECK: vrcp28pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
+  ; CHECK: vexp2ps %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+  %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
+define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
+  ; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+  %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
+  ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
+  ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+  %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+