diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1144,8 +1144,12 @@ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v4i32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v4i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); @@ -1153,6 +1157,8 @@ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } @@ -1400,8 +1406,12 @@ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); @@ -1648,8 +1658,14 @@ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i1, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i1, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v4i1, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v4i1, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); @@ -1732,11 +1748,16 @@ setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); + + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); } for (MVT VT : { MVT::v16i16, MVT::v16i32 }) { setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); } @@ -1901,7 +1922,8 
@@ if (Subtarget.hasDQI()) { for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, - ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, + ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) setOperationAction(Opc, MVT::v8i64, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1998,6 +2020,8 @@ // v2i64 FP_TO_S/UINT(v2f32) custom conversion. setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } @@ -2026,7 +2050,8 @@ if (Subtarget.hasDQI()) { for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, - ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { + ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT, + ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT}) { setOperationAction(Opc, MVT::v2i64, Custom); setOperationAction(Opc, MVT::v4i64, Custom); } @@ -2169,8 +2194,10 @@ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v32i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v32i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, @@ -2178,12 +2205,16 @@ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, MVT::v32i16); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v32i8, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v32i8, Custom); setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, MVT::v32i16); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v32i1, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v32i1, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); @@ -2209,8 +2240,10 @@ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); + setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i16, Custom); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); @@ -2295,12 +2328,16 @@ if (Subtarget.hasFP16()) { // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2f16, Custom); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, 
Custom);
       setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
       setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
       setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
+      setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v4f16, Custom);
       setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
       // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
       setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
@@ -20567,7 +20604,7 @@
   }
   if (VT.isVector()) {
-    if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
+    if ((VT == MVT::v2i1 || VT == MVT::v2i32) && SrcVT == MVT::v2f64) {
       MVT ResVT = MVT::v4i32;
       MVT TruncVT = MVT::v4i1;
       unsigned Opc;
@@ -20980,10 +21017,12 @@
   EVT SrcVT = Src.getValueType();
   EVT DstVT = Node->getValueType(0);
   EVT TmpVT = DstVT;
+  EVT SetCCVT = DAG.getTargetLoweringInfo().getSetCCResultType(
+      DAG.getDataLayout(), *DAG.getContext(), SrcVT);
   // This code is only for floats and doubles. Fall back to generic code for
   // anything else.
-  if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
+  if (!isScalarFPTypeInSSEReg(SrcVT.getScalarType()) || isSoftF16(SrcVT, Subtarget))
     return SDValue();
   EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -20995,14 +21034,16 @@
   // Promote result of FP_TO_*INT to at least 32 bits.
   if (TmpWidth < 32) {
-    TmpVT = MVT::i32;
+    TmpVT =
+        TmpVT.isVector() ? TmpVT.changeVectorElementType(MVT::i32) : MVT::i32;
     TmpWidth = 32;
   }
   // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
   // us to use a native signed conversion instead.
-  if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
-    TmpVT = MVT::i64;
+  if (!TmpVT.isVector() && SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
+    TmpVT =
+        TmpVT.isVector() ? TmpVT.changeVectorElementType(MVT::i64) : MVT::i64;
     TmpWidth = 64;
   }
@@ -21070,8 +21111,8 @@
     // Otherwise, select zero if Src is NaN.
     SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
-    return DAG.getSelectCC(
-        dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+    SDValue IsNaN = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::SETUO);
+    return DAG.getSelect(dl, DstVT, IsNaN, ZeroInt, FpToInt);
   }
   SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
@@ -21093,13 +21134,13 @@
   if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
     // If Src ULT MinFloat, select MinInt. In particular, this also selects
     // MinInt if Src is NaN.
-    Select = DAG.getSelectCC(
-        dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
+    SDValue SatMin = DAG.getSetCC(dl, SetCCVT, Src, MinFloatNode, ISD::SETULT);
+    Select = DAG.getSelect(dl, DstVT, SatMin, MinIntNode, Select);
   }
   // If Src OGT MaxFloat, select MaxInt.
-  Select = DAG.getSelectCC(
-      dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
+  SDValue SatMax = DAG.getSetCC(dl, SetCCVT, Src, MaxFloatNode, ISD::SETOGT);
+  Select = DAG.getSelect(dl, DstVT, SatMax, MaxIntNode, Select);
   // In the unsigned case we are done, because we mapped NaN to MinInt, which
   // is already zero. The promoted case was already handled above.
@@ -21109,8 +21150,8 @@
   // Otherwise, select 0 if Src is NaN.
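+  // FP_TO_*INT_SAT requires a NaN input to produce zero. With vector types
+  // the compare result is a mask type (SetCCVT) rather than DstVT, so build
+  // an explicit getSetCC + getSelect instead of the scalar getSelectCC
+  // shorthand.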
SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); - return DAG.getSelectCC( - dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); + SDValue IsNaN = DAG.getSetCC(dl, SetCCVT, Src, Src, ISD::SETUO); + return DAG.getSelect(dl, DstVT, IsNaN, ZeroInt, Select); } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { @@ -32273,15 +32314,20 @@ return; } case ISD::FP_TO_SINT: + case ISD::FP_TO_SINT_SAT: case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: + case ISD::FP_TO_UINT_SAT: case ISD::STRICT_FP_TO_UINT: { bool IsStrict = N->isStrictFPOpcode(); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT || N->getOpcode() == ISD::STRICT_FP_TO_SINT; + bool IsSaturated = N->getOpcode() == ISD::FP_TO_SINT_SAT || + N->getOpcode() == ISD::FP_TO_UINT_SAT; EVT VT = N->getValueType(0); SDValue Src = N->getOperand(IsStrict ? 1 : 0); SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + SDValue SatType = IsSaturated ? N->getOperand(1) : SDValue(); EVT SrcVT = Src.getValueType(); SDValue Res; @@ -32293,6 +32339,9 @@ {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, {Chain, Src})}); Chain = Res.getValue(1); + } else if (IsSaturated) { + Res = DAG.getNode(N->getOpcode(), dl, VT, + DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src), SatType); } else { Res = DAG.getNode(N->getOpcode(), dl, VT, DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); @@ -32323,6 +32372,8 @@ Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); + } else if (IsSaturated) { + Res = DAG.getNode(N->getOpcode(), dl, ResVT, Src, SatType); } else { unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; Res = DAG.getNode(Opc, dl, ResVT, Src); @@ -32362,7 +32413,9 @@ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); - } else + } else if (IsSaturated) + Res = DAG.getNode(N->getOpcode(), dl, PromoteVT, Src, SatType); + else Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); // Preserve what we know about the size of the original result. If the @@ -32404,6 +32457,8 @@ "Unexpected type action!"); if (Src.getValueType() == MVT::v2f64) { if (!IsSigned && !Subtarget.hasAVX512()) { + if (IsSaturated) + return; SDValue Res = expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget); Results.push_back(Res); @@ -32413,6 +32468,8 @@ unsigned Opc; if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else if (IsSaturated) + Opc = N->getOpcode(); else Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; @@ -32435,6 +32492,10 @@ Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, {N->getOperand(0), Src}); Chain = Res.getValue(1); + } else if (IsSaturated) { + Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, + DAG.getConstantFP(0.0, dl, MVT::v2f64)); + Res = DAG.getNode(Opc, dl, MVT::v4i32, Src, SatType); } else { Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); } @@ -32477,6 +32538,8 @@ if (NumElts != SrcElts) { if (IsStrict) Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; + else if (IsSaturated) + Opc = N->getOpcode(); else Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; } @@ -32498,7 +32561,7 @@ Results.push_back(Chain); return; } - + assert(!IsSaturated); if (VT == MVT::i128 && Subtarget.isTargetWin64()) { SDValue Chain; SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain); diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512dq,+avx512fp16 | FileCheck %s -check-prefix=AVX512 ; i32 saturate @@ -41,6 +42,12 @@ ; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> %0 = icmp slt <2 x i64> %conv, @@ -87,6 +94,12 @@ ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; AVX512-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i64> %0 = icmp ult <2 x i64> %conv, @@ -131,6 +144,14 @@ ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f64i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> %0 = icmp slt <2 x i64> %conv, @@ -144,78 +165,23 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm4 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm4, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm7, %xmm8 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm9 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; CHECK-NEXT: por %xmm9, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm1 -; CHECK-NEXT: por %xmm4, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm4 -; CHECK-NEXT: pxor %xmm0, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; CHECK-NEXT: pcmpeqd 
%xmm6, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm4, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm3, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: movdqa %xmm5, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm5 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm5, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm2, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm1 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm2 +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: andnps %xmm2, %xmm3 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: orps %xmm3, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -296,6 +262,13 @@ ; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -373,6 +346,15 @@ ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -388,98 +370,59 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $16, %xmm1 ; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; CHECK-NEXT: movaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps {{.*#+}} xmm3 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm3 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: orps %xmm2, %xmm3 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm3, %xmm8 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm5, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: pand %xmm1, %xmm8 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: movdqa %xmm4, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm6 -; CHECK-NEXT: pcmpeqd %xmm6, %xmm5 -; CHECK-NEXT: movdqa {{.*#+}} xmm7 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm8 
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm3 -; CHECK-NEXT: pand %xmm3, %xmm4 -; CHECK-NEXT: pandn %xmm2, %xmm3 -; CHECK-NEXT: por %xmm4, %xmm3 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm6, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm7, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm2, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-NEXT: movaps {{.*#+}} xmm2 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm2 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm0 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: andnps %xmm0, %xmm1 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: orps %xmm1, %xmm2 +; CHECK-NEXT: cmpunordps %xmm3, %xmm3 +; CHECK-NEXT: andnps %xmm2, %xmm3 +; CHECK-NEXT: unpcklpd (%rsp), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2qq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -579,6 +522,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2uqq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <4 x half> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -678,6 +628,15 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i32: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2qq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -708,6 +667,14 @@ ; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i32> %0 = icmp slt <2 x i32> %conv, @@ -737,6 +704,13 @@ ; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, 
%xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i32> %0 = icmp ult <2 x i32> %conv, @@ -761,6 +735,15 @@ ; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f64i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i32> %0 = icmp slt <2 x i32> %conv, @@ -777,6 +760,12 @@ ; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: packssdw %xmm0, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i32> %0 = icmp slt <4 x i32> %conv, @@ -808,6 +797,12 @@ ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i32> %0 = icmp ult <4 x i32> %conv, @@ -834,6 +829,12 @@ ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i32> %0 = icmp slt <4 x i32> %conv, @@ -899,6 +900,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <8 x half> %x to <8 x i32> %0 = icmp slt <8 x i32> %conv, @@ -1008,6 +1016,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2udq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <8 x half> %x to <8 x i32> %0 = icmp ult <8 x i32> %conv, @@ -1094,6 +1109,15 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i16: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <8 x half> %x to <8 x i32> %0 = icmp slt <8 x i32> %conv, @@ -1155,6 +1179,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcmpgtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordpd %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; 
AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1198,6 +1232,39 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i128> %0 = icmp ult <2 x i128> %conv, @@ -1252,6 +1319,52 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f64i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: movq %rbx, %rdi +; AVX512-NEXT: negq %rdi +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: sbbq %rsi, %rdi +; AVX512-NEXT: cmovgeq %rcx, %rbx +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: negq %rsi +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: sbbq %rdx, %rsi +; AVX512-NEXT: cmovgeq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1311,6 +1424,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %xmm1 +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; 
AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq entry: %conv = fptosi <2 x float> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1354,6 +1477,39 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x float> %x to <2 x i128> %0 = icmp ult <2 x i128> %conv, @@ -1408,6 +1564,52 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: movq %rbx, %rdi +; AVX512-NEXT: negq %rdi +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: sbbq %rsi, %rdi +; AVX512-NEXT: cmovgeq %rcx, %rbx +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: negq %rsi +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: sbbq %rdx, %rsi +; AVX512-NEXT: cmovgeq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x float> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1467,6 +1669,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2qq %xmm0, %xmm1 +; AVX512-NEXT: vcmpgtph 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordph %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq entry: %conv = fptosi <2 x half> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1511,6 +1723,38 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunshfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: callq __fixunshfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x half> %x to <2 x i128> %0 = icmp ult <2 x i128> %conv, @@ -1565,6 +1809,52 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i64: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: callq __fixhfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixhfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: movq %rbx, %rdi +; AVX512-NEXT: negq %rdi +; AVX512-NEXT: movl $0, %edi +; AVX512-NEXT: sbbq %rsi, %rdi +; AVX512-NEXT: cmovgeq %rcx, %rbx +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: negq %rsi +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: sbbq %rdx, %rsi +; AVX512-NEXT: cmovgeq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x half> %x to <2 x i128> %0 = icmp slt <2 x i128> %conv, @@ -1617,6 +1907,12 @@ ; CHECK-NEXT: por %xmm3, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i32_mm: +; 
AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512-NEXT: vpmovsqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1661,6 +1957,12 @@ ; CHECK-NEXT: por %xmm1, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm0 +; AVX512-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.umin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1704,6 +2006,14 @@ ; CHECK-NEXT: pand %xmm3, %xmm1 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f64i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovusqd %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i64> %spec.store.select = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %conv, <2 x i64> ) @@ -1715,78 +2025,23 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %rax -; CHECK-NEXT: movq %rax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm2 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm2, %rax -; CHECK-NEXT: movq %rax, %xmm2 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm3, %xmm1 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm6, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm8 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; CHECK-NEXT: por %xmm8, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm3 -; CHECK-NEXT: pandn %xmm4, %xmm1 -; CHECK-NEXT: por %xmm3, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm3 -; CHECK-NEXT: pxor %xmm0, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm7 -; CHECK-NEXT: pcmpgtd %xmm3, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; CHECK-NEXT: pand %xmm7, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm5 -; CHECK-NEXT: pand %xmm5, %xmm2 -; CHECK-NEXT: pandn %xmm4, %xmm5 -; CHECK-NEXT: por %xmm2, %xmm5 -; CHECK-NEXT: movdqa %xmm5, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm4 -; CHECK-NEXT: pcmpeqd %xmm4, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm5 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm5, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm4, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 -; CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm1 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm2 +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: andnps %xmm2, %xmm3 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: orps %xmm3, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -1865,6 +2120,13 @@ ; CHECK-NEXT: por %xmm4, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2uqq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -1941,6 +2203,15 @@ ; CHECK-NEXT: pand %xmm1, %xmm0 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -1954,98 +2225,59 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 80 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: psrld $16, %xmm1 ; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa %xmm0, %xmm1 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = 
xmm0[0],mem[0] ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps {{.*#+}} xmm3 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm3 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: orps %xmm2, %xmm3 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm3, %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: movq %rax, %xmm0 -; CHECK-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648] -; CHECK-NEXT: movdqa %xmm2, %xmm1 -; CHECK-NEXT: movdqa %xmm2, %xmm7 -; CHECK-NEXT: pxor %xmm0, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; CHECK-NEXT: pxor %xmm3, %xmm3 -; CHECK-NEXT: pcmpeqd %xmm3, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; CHECK-NEXT: movdqa %xmm4, %xmm5 -; CHECK-NEXT: pcmpgtd %xmm1, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; CHECK-NEXT: pand %xmm2, %xmm6 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; CHECK-NEXT: por %xmm6, %xmm1 -; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; CHECK-NEXT: pand %xmm1, %xmm7 -; CHECK-NEXT: pandn %xmm2, %xmm1 -; CHECK-NEXT: por %xmm7, %xmm1 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; CHECK-NEXT: movdqa %xmm7, %xmm5 -; CHECK-NEXT: pxor %xmm0, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm3, %xmm6 -; CHECK-NEXT: pcmpgtd %xmm5, %xmm4 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; CHECK-NEXT: pand %xmm6, %xmm3 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm7, %xmm3 -; CHECK-NEXT: pand %xmm4, %xmm3 -; CHECK-NEXT: pandn %xmm2, %xmm4 -; CHECK-NEXT: por %xmm3, %xmm4 -; CHECK-NEXT: movdqa %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm0, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm5 -; CHECK-NEXT: pcmpeqd %xmm5, %xmm3 -; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; CHECK-NEXT: pcmpgtd %xmm6, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,0,2,2] -; CHECK-NEXT: pand %xmm3, %xmm7 -; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; CHECK-NEXT: por %xmm7, %xmm2 -; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [18446744071562067968,18446744071562067968] -; CHECK-NEXT: pand %xmm2, %xmm4 -; CHECK-NEXT: pandn %xmm3, %xmm2 -; CHECK-NEXT: por %xmm4, %xmm2 -; CHECK-NEXT: pxor %xmm1, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; CHECK-NEXT: pcmpeqd %xmm5, %xmm4 -; CHECK-NEXT: pcmpgtd %xmm6, %xmm0 -; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2] -; CHECK-NEXT: pand %xmm4, %xmm5 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: por %xmm5, %xmm0 -; CHECK-NEXT: pand %xmm0, %xmm1 -; CHECK-NEXT: pandn %xmm3, %xmm0 -; 
CHECK-NEXT: por %xmm1, %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-NEXT: movaps {{.*#+}} xmm2 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm2 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm0 +; CHECK-NEXT: movaps %xmm2, %xmm1 +; CHECK-NEXT: andnps %xmm0, %xmm1 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: orps %xmm1, %xmm2 +; CHECK-NEXT: cmpunordps %xmm3, %xmm3 +; CHECK-NEXT: andnps %xmm2, %xmm3 +; CHECK-NEXT: unpcklpd (%rsp), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] +; CHECK-NEXT: movaps %xmm3, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2qq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2143,6 +2375,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2uqq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2241,6 +2480,15 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i32_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2qq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2269,6 +2517,14 @@ ; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i32> %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> ) @@ -2296,6 +2552,13 @@ ; CHECK-NEXT: por %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i32> %spec.store.select = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %conv, <2 x i32> ) @@ -2319,6 +2582,15 @@ ; CHECK-NEXT: pand %xmm2, %xmm1 ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: 
ustest_f64i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i32> %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> ) @@ -2333,6 +2605,12 @@ ; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: packssdw %xmm0, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2362,6 +2640,12 @@ ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2udq %xmm0, %xmm0 +; AVX512-NEXT: vpmovusdw %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptoui <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2387,6 +2671,12 @@ ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %conv = fptosi <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2450,6 +2740,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpmovsdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -2557,6 +2854,13 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2udq %xmm0, %ymm0 +; AVX512-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptoui <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -2642,6 +2946,15 @@ ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i16_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpmovusdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq entry: %conv = fptosi <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -2701,6 +3014,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f64i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcmpgtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordpd %xmm0, %xmm0, %k0 +; 
AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 9223372036854775807, i128 9223372036854775807>) @@ -2742,6 +3065,39 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f64i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x double> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.umin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) @@ -2789,6 +3145,46 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f64i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovsq %rcx, %rbx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovsq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x double> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) @@ -2846,6 +3242,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f32i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vcvttps2qq %xmm0, %xmm1 +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +;
AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq entry: %conv = fptosi <2 x float> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 9223372036854775807, i128 9223372036854775807>) @@ -2887,6 +3293,39 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utest_f32i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x float> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.umin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) @@ -2934,6 +3373,46 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f32i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovsq %rcx, %rbx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovsq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x float> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) @@ -2991,6 +3470,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: stest_f16i64_mm: +; AVX512: # %bb.0: # %entry +;
AVX512-NEXT: vcvttph2qq %xmm0, %xmm1 +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordph %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq entry: %conv = fptosi <2 x half> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 9223372036854775807, i128 9223372036854775807>) @@ -3033,6 +3522,38 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: utesth_f16i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: callq __fixunshfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vpsrld $16, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: callq __fixunshfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovneq %rcx, %rax +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovneq %rcx, %rbx +; AVX512-NEXT: vmovq %rbx, %xmm0 +; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptoui <2 x half> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.umin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) @@ -3080,6 +3601,46 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq +; +; AVX512-LABEL: ustest_f16i64_mm: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: subq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 48 +; AVX512-NEXT: .cfi_offset %rbx, -24 +; AVX512-NEXT: .cfi_offset %r14, -16 +; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512-NEXT: callq __fixhfti@PLT +; AVX512-NEXT: movq %rax, %rbx +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixhfti@PLT +; AVX512-NEXT: xorl %ecx, %ecx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovgq %rcx, %rax +; AVX512-NEXT: movl $1, %esi +; AVX512-NEXT: cmovgq %rsi, %rdx +; AVX512-NEXT: testq %r14, %r14 +; AVX512-NEXT: cmovgq %rcx, %rbx +; AVX512-NEXT: cmovleq %r14, %rsi +; AVX512-NEXT: testq %rsi, %rsi +; AVX512-NEXT: cmovsq %rcx, %rbx +; AVX512-NEXT: testq %rdx, %rdx +; AVX512-NEXT: cmovsq %rcx, %rax +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rbx, %xmm1 +; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512-NEXT: addq $24, %rsp +; AVX512-NEXT: .cfi_def_cfa_offset 24 +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: .cfi_def_cfa_offset 16 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: .cfi_def_cfa_offset 8 +; AVX512-NEXT: retq entry: %conv = fptosi <2 x half> %x to <2 x i128> %spec.store.select = call <2 x i128> @llvm.smin.v2i128(<2 x i128> %conv, <2 x i128> <i128 18446744073709551615, i128 18446744073709551615>) diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl,+avx512dq,+avx512fp16 | FileCheck %s --check-prefixes=AVX512 ; ; 32-bit float to signed integer @@ -16,43 +17,25 @@ ; CHECK-LABEL: test_signed_v4i1_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: ucomiss %xmm1, %xmm1 -; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %ecx -; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: ucomiss %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %ecx -; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %ecx -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: maxss %xmm2, %xmm0 -; CHECK-NEXT: minss %xmm3, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %ecx -; CHECK-NEXT: cmovpl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: xorps %xmm2, %xmm2 +; CHECK-NEXT: minps %xmm1, %xmm2 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i1_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vminps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovd2m %xmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f32(<4 x float> %f) ret <4 x i1> %x } @@ -60,42 +43,26 @@ define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i8_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm1, %xmm3 -; CHECK-NEXT: maxss %xmm0, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl %eax, %ecx 
-; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: movaps %xmm1, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: minss %xmm1, %xmm2 -; CHECK-NEXT: cvttss2si %xmm2, %ecx -; CHECK-NEXT: shll $24, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm1 +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm1 +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i8_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmaxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; AVX512-NEXT: vminps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f32(<4 x float> %f) ret <4 x i8> %x } @@ -104,36 +71,25 @@ ; CHECK-LABEL: test_signed_v4i16_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: maxss %xmm1, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm4, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %eax -; CHECK-NEXT: movaps %xmm2, %xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm1, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: pinsrw $1, %eax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: movaps %xmm2, %xmm5 -; CHECK-NEXT: maxss %xmm3, %xmm5 -; CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm5, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax -; CHECK-NEXT: pinsrw $2, %eax, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: maxss %xmm0, %xmm2 -; CHECK-NEXT: minss %xmm2, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: pinsrw $3, %eax, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: cmpunordps %xmm0, %xmm1 +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm1 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i16_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmaxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 +; AVX512-NEXT: vminps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1 +; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq %x 
= call <4 x i16> @llvm.fptosi.sat.v4i16.v4f32(<4 x float> %f) ret <4 x i16> %x } @@ -141,43 +97,26 @@ define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_signed_v4i32_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %edx -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss %xmm2, %xmm1 -; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: ucomiss %xmm1, %xmm1 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm3, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm3 -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm3 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm3 -; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %edx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmoval %eax, %edx -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm1 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm2 +; CHECK-NEXT: movaps %xmm1, %xmm3 +; CHECK-NEXT: andnps %xmm2, %xmm3 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: orps %xmm3, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %x = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> %f) ret <4 x i32> %x } @@ -221,6 +160,16 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i64_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vcvttps2qq %xmm0, %ymm1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1} +; AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} +; AVX512-NEXT: retq %x = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f32(<4 x float> %f) ret <4 x i64> %x } @@ -329,6 +278,110 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v4i128_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $56, %rsp +; 
AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: xorl %r14d, %r14d +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r14, %rax +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %r15 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rbp, %r15 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r14, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r14, %r15 +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: movq %rdx, %r13 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r14, %r12 +; AVX512-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rax, %r13 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %rbp, %r13 +; AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovaq %rax, %r12 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r14, %r12 +; AVX512-NEXT: cmovpq %r14, %r13 +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: cmovbq %rax, %rbp +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %r14 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rcx, %r14 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rbp +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %rax, %rbp +; AVX512-NEXT: cmovpq %rax, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: cmovbq %rsi, %rax +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %rsi, %rax +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: cmovpq %rcx, %rdx +; AVX512-NEXT: 
movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r14, 56(%rbx) +; AVX512-NEXT: movq %rbp, 48(%rbx) +; AVX512-NEXT: movq %r13, 40(%rbx) +; AVX512-NEXT: movq %r12, 32(%rbx) +; AVX512-NEXT: movq %r15, 24(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = call <4 x i128> @llvm.fptosi.sat.v4i128.v4f32(<4 x float> %f) ret <4 x i128> %x } @@ -367,6 +420,19 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i1_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [-1.0E+0,-1.0E+0] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovd2m %xmm0, %k0 +; AVX512-NEXT: vpmovm2q %k0, %xmm0 +; AVX512-NEXT: retq %x = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> %f) ret <2 x i1> %x } @@ -374,22 +440,40 @@ define <2 x i8> @test_signed_v2i8_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_signed_v2i8_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movapd %xmm1, %xmm2 -; CHECK-NEXT: maxsd %xmm0, %xmm2 +; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: maxsd %xmm2, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: movapd %xmm3, %xmm4 -; CHECK-NEXT: minsd %xmm2, %xmm4 -; CHECK-NEXT: cvttsd2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: minsd %xmm3, %xmm1 +; CHECK-NEXT: cvttsd2si %xmm1, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: minsd %xmm1, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %ecx -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: maxsd %xmm2, %xmm0 +; CHECK-NEXT: minsd %xmm3, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %eax +; CHECK-NEXT: cmovpl %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i8_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512-NEXT: vcmpunordpd %ymm0, %ymm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} {z} +; AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f64(<2 x double> %f) ret <2 x i8> %x } @@ -398,19 +482,37 @@ ; CHECK-LABEL: test_signed_v2i16_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movapd %xmm1, %xmm2 -; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: 
maxsd %xmm1, %xmm2 +; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: minsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %eax +; CHECK-NEXT: xorl %ecx, %ecx +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: cmovpl %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm2 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movapd %xmm0, %xmm3 -; CHECK-NEXT: minsd %xmm2, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %eax -; CHECK-NEXT: minsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pinsrw $1, %eax, %xmm0 +; CHECK-NEXT: ucomisd %xmm0, %xmm0 +; CHECK-NEXT: maxsd %xmm1, %xmm0 +; CHECK-NEXT: minsd %xmm3, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %eax +; CHECK-NEXT: cmovpl %ecx, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i16_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512-NEXT: vcmpunordpd %ymm0, %ymm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} {z} +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f64(<2 x double> %f) ret <2 x i16> %x } @@ -438,6 +540,17 @@ ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i32_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vmaxpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm1 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512-NEXT: vcmpunordpd %ymm0, %ymm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} {z} +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> %f) ret <2 x i32> %x } @@ -464,6 +577,16 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpgtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 +; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpunordpd %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} +; AVX512-NEXT: retq %x = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f64(<2 x double> %f) ret <2 x i64> %x } @@ -525,6 +648,63 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v2i128_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $40, %rsp +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: xorl %r12d, %r12d +; AVX512-NEXT: vmovapd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX512-NEXT: cmovbq %r12, %r14 +; AVX512-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rax, %r15 +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX512-NEXT: movabsq $9223372036854775807, %rbp # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rbp, %r15 +; AVX512-NEXT: movq $-1, %r13 +; AVX512-NEXT: cmovaq %r13, %r14 +; AVX512-NEXT: vucomisd %xmm1, %xmm1 +; AVX512-NEXT: cmovpq %r12, %r14 +; AVX512-NEXT: cmovpq %r12, %r15 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixdfti@PLT +; AVX512-NEXT: vmovapd (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %rbp, %rdx +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: vucomisd %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: cmovpq %r12, %rdx +; AVX512-NEXT: movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r15, 24(%rbx) +; AVX512-NEXT: movq %r14, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $40, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = call <2 x i128> @llvm.fptosi.sat.v2i128.v2f64(<2 x double> %f) ret <2 x i128> %x } @@ -661,6 +841,19 @@ ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i1_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; AVX512-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vminph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-NEXT: vpmovm2w %k0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> %f) ret <8 x i1> %x } @@ -668,123 +861,81 @@ define <8 x i8> @test_signed_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_signed_v8i8_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r13 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %r12d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $128, %ebx -; CHECK-NEXT: cmovbl %ebx, %r12d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $127, %ebp -; CHECK-NEXT: cmoval %ebp, %r12d -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %r12d -; CHECK-NEXT: shll $8, %r12d -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; 
CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movzbl %al, %r15d -; CHECK-NEXT: orl %r12d, %r15d +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movzbl %al, %r12d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movzbl %al, %r13d -; CHECK-NEXT: shll $8, %r13d -; CHECK-NEXT: orl %r12d, %r13d +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movzbl %al, %r12d -; CHECK-NEXT: shll $16, %r12d -; CHECK-NEXT: orl %r13d, %r12d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: cmpunordps %xmm0, %xmm1 +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm1 +; CHECK-NEXT: pslld $16, %xmm1 +; CHECK-NEXT: psrad $16, %xmm1 +; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %r12d, %eax -; 
CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrw $2, %r15d, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %r15d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %r15d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %r15d -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %r15d -; CHECK-NEXT: shll $8, %r15d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %r15d, %eax -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-NEXT: addq $40, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r13 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: cmpunordps %xmm1, %xmm0 +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: cvttps2dq %xmm1, %xmm1 +; CHECK-NEXT: andnps %xmm1, %xmm0 +; CHECK-NEXT: pslld $16, %xmm0 +; CHECK-NEXT: psrad $16, %xmm0 +; CHECK-NEXT: packssdw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: packuswb %xmm0, %xmm0 +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i8_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-1.28E+2,-1.28E+2,-1.28E+2,-1.28E+2,-1.28E+2,-1.28E+2,-1.28E+2,-1.28E+2] +; AVX512-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1.27E+2,1.27E+2,1.27E+2,1.27E+2,1.27E+2,1.27E+2,1.27E+2,1.27E+2] +; AVX512-NEXT: vminph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> %f) ret <8 x i8> %x } @@ -912,6 +1063,18 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i16_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpngeph 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 +; AVX512-NEXT: vmovdqu16 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vmovdqu16 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> %f) ret <8 x i16> %x } @@ -919,125 +1082,101 @@ define <8 x i32> @test_signed_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_signed_v8i32_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $64, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $-2147483648, %ebx # imm = 0x80000000 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $2147483647, %ebp # imm = 0x7FFFFFFF -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: xorl %r14d, %r14d -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm0 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: cmpunordps %xmm3, %xmm3 +; CHECK-NEXT: andnps %xmm0, %xmm3 +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 
-; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm0 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: cmpunordps %xmm3, %xmm3 +; CHECK-NEXT: andnps %xmm0, %xmm3 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0] +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; CHECK-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1] +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm0 +; CHECK-NEXT: cvttps2dq %xmm3, %xmm1 +; CHECK-NEXT: movaps %xmm0, %xmm2 +; CHECK-NEXT: andnps %xmm1, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: orps %xmm2, %xmm0 +; CHECK-NEXT: cmpunordps %xmm3, %xmm3 +; CHECK-NEXT: andnps %xmm0, %xmm3 +; CHECK-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, 
%eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: ucomiss %xmm0, %xmm0 -; CHECK-NEXT: cmovpl %r14d, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: movaps {{.*#+}} xmm3 = [2.14748352E+9,2.14748352E+9,2.14748352E+9,2.14748352E+9] +; CHECK-NEXT: cmpltps %xmm1, %xmm3 +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 +; CHECK-NEXT: movaps %xmm3, %xmm2 +; CHECK-NEXT: andnps %xmm0, %xmm2 +; CHECK-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; CHECK-NEXT: orps %xmm2, %xmm3 +; CHECK-NEXT: cmpunordps %xmm1, %xmm1 +; CHECK-NEXT: andnps %xmm3, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: addq $64, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm1 +; AVX512-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1 {%k1} +; AVX512-NEXT: vcmpunordph %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} {z} +; AVX512-NEXT: retq %x = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> %f) ret <8 x i32> %x } @@ -1162,6 +1301,16 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i64_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vcvttph2qq %xmm0, %zmm1 +; AVX512-NEXT: vpbroadcastq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1 {%k1} +; AVX512-NEXT: vcmpunordph %xmm0, %xmm0, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} +; AVX512-NEXT: retq %x = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> %f) ret <8 x i64> %x } @@ -1376,6 +1525,215 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_signed_v8i128_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $104, %rsp +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: xorl %r12d, %r12d +; 
AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: movq %rcx, %r14 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: movq %rcx, %r15 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: movq $-1, %r13 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r12, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: cmovbq %r14, %rdx +; AVX512-NEXT: movq %r14, %rbp +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r15, %rdx +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq $-1, %r14 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r12, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: cmovbq %rbp, %rdx +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r15, %rdx +; AVX512-NEXT: cmovaq %r14, %rax +; AVX512-NEXT: movq $-1, %r14 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r12, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: cmovbq %rbp, %rdx +; AVX512-NEXT: movq %rbp, %r13 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r15, %rdx +; AVX512-NEXT: cmovaq %r14, %rax +; AVX512-NEXT: movq $-1, %r14 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: movq 
%rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r12, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rdx, %rbp +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: cmovbq %r13, %rbp +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r15, %rbp +; AVX512-NEXT: movq %r15, %r13 +; AVX512-NEXT: cmovaq %r14, %rax +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovpq %r12, %rbp +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %r14 +; AVX512-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rax, %r15 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %r15 +; AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovaq %rax, %r14 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %r12, %r14 +; AVX512-NEXT: cmovpq %r12, %r15 +; AVX512-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: movq %rdx, %r13 +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: cmovbq %rax, %r12 +; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %r13 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rcx, %r13 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %r12 +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %rax, %r12 +; AVX512-NEXT: cmovpq %rax, %r13 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixsfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 
+; AVX512-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: movl $0, %esi +; AVX512-NEXT: cmovbq %rsi, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: vucomiss %xmm0, %xmm0 +; AVX512-NEXT: cmovpq %rsi, %rax +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: cmovpq %rcx, %rdx +; AVX512-NEXT: movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r13, 120(%rbx) +; AVX512-NEXT: movq %r12, 112(%rbx) +; AVX512-NEXT: movq %r15, 104(%rbx) +; AVX512-NEXT: movq %r14, 96(%rbx) +; AVX512-NEXT: movq %rbp, 88(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 80(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 72(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 64(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 56(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 48(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 40(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 32(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 24(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $104, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = call <8 x i128> @llvm.fptosi.sat.v8i128.v8f16(<8 x half> %f) ret <8 x i128> %x } diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl,+avx512dq,+avx512fp16 | FileCheck %s --check-prefixes=AVX512 ; ; 32-bit float to unsigned integer @@ -15,35 +16,23 @@ define <4 x i1> @test_unsigned_v4i1_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i1_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: maxss %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: movd %eax, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: maxss %xmm2, %xmm0 -; CHECK-NEXT: minss %xmm3, %xmm0 -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i1_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vminps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovd2m %xmm0, %k0 +; AVX512-NEXT: vpmovm2d %k0, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f32(<4 x float> %f) ret <4 x i1> %x } @@ -52,41 +41,21 @@ ; CHECK-LABEL: test_unsigned_v4i8_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: maxss %xmm0, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm2, %xmm4 -; CHECK-NEXT: minss %xmm3, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[1,1] -; CHECK-NEXT: xorps %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx -; CHECK-NEXT: movzbl %cl, %ecx -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: xorps %xmm4, %xmm4 -; CHECK-NEXT: maxss %xmm3, %xmm4 -; CHECK-NEXT: movaps %xmm2, %xmm3 -; CHECK-NEXT: minss %xmm4, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: shll $16, %eax -; CHECK-NEXT: orl %ecx, %eax -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: minss %xmm1, %xmm2 -; CHECK-NEXT: cvttss2si %xmm2, %ecx -; CHECK-NEXT: shll $24, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: packuswb %xmm0, %xmm0 +; CHECK-NEXT: packuswb %xmm0, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i8_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> %f) ret <4 x i8> %x } @@ -94,37 +63,23 @@ define <4 x i16> @test_unsigned_v4i16_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i16_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1] -; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: xorps %xmm3, %xmm3 -; CHECK-NEXT: maxss %xmm1, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; CHECK-NEXT: movaps %xmm4, %xmm1 -; CHECK-NEXT: minss %xmm3, %xmm1 -; CHECK-NEXT: cvttss2si %xmm1, %eax ; CHECK-NEXT: xorps %xmm1, 
%xmm1 -; CHECK-NEXT: maxss %xmm0, %xmm1 -; CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm1, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %ecx -; CHECK-NEXT: movd %ecx, %xmm1 -; CHECK-NEXT: pinsrw $1, %eax, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm3 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] -; CHECK-NEXT: xorps %xmm5, %xmm5 -; CHECK-NEXT: maxss %xmm3, %xmm5 -; CHECK-NEXT: movaps %xmm4, %xmm3 -; CHECK-NEXT: minss %xmm5, %xmm3 -; CHECK-NEXT: cvttss2si %xmm3, %eax -; CHECK-NEXT: pinsrw $2, %eax, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: maxss %xmm0, %xmm2 -; CHECK-NEXT: minss %xmm2, %xmm4 -; CHECK-NEXT: cvttss2si %xmm4, %eax -; CHECK-NEXT: pinsrw $3, %eax, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i16_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vminps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f32(<4 x float> %f) ret <4 x i16> %x } @@ -132,44 +87,33 @@ define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i32_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps %xmm0, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] -; CHECK-NEXT: cvttss2si %xmm1, %rdx -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorps %xmm2, %xmm2 -; CHECK-NEXT: ucomiss %xmm2, %xmm1 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: ucomiss %xmm3, %xmm1 -; CHECK-NEXT: movl $-1, %ecx -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: movaps %xmm0, %xmm4 -; CHECK-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1] -; CHECK-NEXT: cvttss2si %xmm4, %rdx -; CHECK-NEXT: ucomiss %xmm2, %xmm4 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm4 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm4 -; CHECK-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm1 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: cvttss2si %xmm0, %rdx -; CHECK-NEXT: ucomiss %xmm2, %xmm0 -; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: ucomiss %xmm3, %xmm0 -; CHECK-NEXT: cmoval %ecx, %edx -; CHECK-NEXT: movd %edx, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; CHECK-NEXT: movdqa %xmm1, %xmm0 +; CHECK-NEXT: cmpnleps %xmm0, %xmm2 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm3 +; CHECK-NEXT: movdqa %xmm3, %xmm4 +; CHECK-NEXT: psrad $31, %xmm4 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [4.29496704E+9,4.29496704E+9,4.29496704E+9,4.29496704E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm1 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm4, %xmm0 +; CHECK-NEXT: por %xmm3, %xmm0 +; CHECK-NEXT: 
andnps %xmm0, %xmm2 +; CHECK-NEXT: orps %xmm2, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i32_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnleps %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: retq %x = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> %f) ret <4 x i32> %x } @@ -243,6 +187,18 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i64_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnleps %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vcmpgtps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512-NEXT: retq %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) ret <4 x i64> %x } @@ -331,6 +287,90 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v4i128_v4f32: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $56, %rsp +; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: xorl %r14d, %r14d +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vucomiss %xmm0, %xmm1 +; AVX512-NEXT: cmovbq %r14, %r15 +; AVX512-NEXT: cmovbq %r14, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX512-NEXT: movq $-1, %rbp +; AVX512-NEXT: cmovaq %rbp, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %rbp, %r15 +; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: movq %rdx, %r13 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r14, %r13 +; AVX512-NEXT: cmovbq %r14, %r12 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %rbp, %r12 +; AVX512-NEXT: cmovaq %rbp, %r13 +; AVX512-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: movq %rdx, %r14 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: cmovbq %rax, %r14 +; AVX512-NEXT: 
cmovbq %rax, %rbp +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovaq %rax, %rbp +; AVX512-NEXT: cmovaq %rax, %r14 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: cmovbq %rcx, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r14, 56(%rbx) +; AVX512-NEXT: movq %rbp, 48(%rbx) +; AVX512-NEXT: movq %r13, 40(%rbx) +; AVX512-NEXT: movq %r12, 32(%rbx) +; AVX512-NEXT: movq %r15, 24(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $56, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = call <4 x i128> @llvm.fptoui.sat.v4i128.v4f32(<4 x float> %f) ret <4 x i128> %x } @@ -364,6 +404,19 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i1_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vminpd %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpmovd2m %xmm0, %k0 +; AVX512-NEXT: vpmovm2q %k0, %xmm0 +; AVX512-NEXT: retq %x = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> %f) ret <2 x i1> %x } @@ -371,22 +424,34 @@ define <2 x i8> @test_unsigned_v2i8_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i8_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: xorpd %xmm2, %xmm2 -; CHECK-NEXT: maxsd %xmm0, %xmm2 +; CHECK-NEXT: movapd %xmm0, %xmm1 +; CHECK-NEXT: maxsd %xmm2, %xmm1 ; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; CHECK-NEXT: movapd %xmm3, %xmm4 -; CHECK-NEXT: minsd %xmm2, %xmm4 -; CHECK-NEXT: cvttsd2si %xmm4, %eax -; CHECK-NEXT: movzbl %al, %eax +; CHECK-NEXT: minsd %xmm3, %xmm1 +; CHECK-NEXT: cvttsd2si %xmm1, %eax +; CHECK-NEXT: movd %eax, %xmm1 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: minsd %xmm1, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %ecx -; CHECK-NEXT: shll $8, %ecx -; CHECK-NEXT: orl %eax, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 +; CHECK-NEXT: maxsd %xmm2, %xmm0 +; CHECK-NEXT: minsd %xmm3, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: packuswb %xmm1, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i8_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512-NEXT: vpmovdb %xmm0, %xmm0 +; AVX512-NEXT: 
vzeroupper +; AVX512-NEXT: retq %x = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f64(<2 x double> %f) ret <2 x i8> %x } @@ -395,19 +460,31 @@ ; CHECK-LABEL: test_unsigned_v2i16_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: xorpd %xmm1, %xmm1 -; CHECK-NEXT: maxsd %xmm0, %xmm1 +; CHECK-NEXT: movapd %xmm0, %xmm2 +; CHECK-NEXT: maxsd %xmm1, %xmm2 +; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: minsd %xmm3, %xmm2 +; CHECK-NEXT: cvttsd2si %xmm2, %eax +; CHECK-NEXT: movd %eax, %xmm2 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: xorpd %xmm2, %xmm2 -; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movapd %xmm0, %xmm3 -; CHECK-NEXT: minsd %xmm2, %xmm3 -; CHECK-NEXT: cvttsd2si %xmm3, %eax -; CHECK-NEXT: minsd %xmm1, %xmm0 -; CHECK-NEXT: cvttsd2si %xmm0, %ecx -; CHECK-NEXT: movd %ecx, %xmm0 -; CHECK-NEXT: pinsrw $1, %eax, %xmm0 +; CHECK-NEXT: maxsd %xmm1, %xmm0 +; CHECK-NEXT: minsd %xmm3, %xmm0 +; CHECK-NEXT: cvttsd2si %xmm0, %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i16_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f64(<2 x double> %f) ret <2 x i16> %x } @@ -431,6 +508,16 @@ ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i32_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovapd %xmm0, %xmm0 +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0 +; AVX512-NEXT: vcvttpd2udq %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> %f) ret <2 x i32> %x } @@ -473,6 +560,18 @@ ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; CHECK-NEXT: movdqa %xmm1, %xmm0 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i64_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnlepd %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotw %k0, %k1 +; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} {z} +; AVX512-NEXT: vcmpgtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: retq %x = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f64(<2 x double> %f) ret <2 x i64> %x } @@ -524,6 +623,53 @@ ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v2i128_v2f64: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $32, %rsp +; AVX512-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq 
%rdx, %r15 +; AVX512-NEXT: xorl %r12d, %r12d +; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX512-NEXT: vucomisd %xmm0, %xmm1 +; AVX512-NEXT: cmovbq %r12, %r15 +; AVX512-NEXT: cmovbq %r12, %r14 +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX512-NEXT: movq $-1, %r13 +; AVX512-NEXT: cmovaq %r13, %r14 +; AVX512-NEXT: cmovaq %r13, %r15 +; AVX512-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: callq __fixunsdfti@PLT +; AVX512-NEXT: vmovapd (%rsp), %xmm0 # 16-byte Reload +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rdx +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomisd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: cmovaq %r13, %rdx +; AVX512-NEXT: movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r15, 24(%rbx) +; AVX512-NEXT: movq %r14, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $32, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: retq %x = call <2 x i128> @llvm.fptoui.sat.v2i128.v2f64(<2 x double> %f) ret <2 x i128> %x } @@ -644,6 +790,19 @@ ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i1_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX512-NEXT: vminph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512-NEXT: vpmovd2m %ymm0, %k0 +; AVX512-NEXT: vpmovm2w %k0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> %f) ret <8 x i1> %x } @@ -651,105 +810,72 @@ define <8 x i8> @test_unsigned_v8i8_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_unsigned_v8i8_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: subq $32, %rsp +; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %r15d -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbl %ebx, %r15d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $255, %ebp -; CHECK-NEXT: cmoval %ebp, %r15d -; CHECK-NEXT: shll $8, %r15d -; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movzbl %al, %r14d -; CHECK-NEXT: orl %r15d, %r14d +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; 
CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movzbl %al, %r15d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movzbl %al, %r12d -; CHECK-NEXT: shll $8, %r12d -; CHECK-NEXT: orl %r15d, %r12d +; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movzbl %al, %r15d -; CHECK-NEXT: shll $16, %r15d -; CHECK-NEXT: orl %r12d, %r15d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: xorps %xmm1, %xmm1 +; CHECK-NEXT: maxps %xmm1, %xmm0 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: shll $24, %eax -; CHECK-NEXT: orl %r15d, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: pinsrw $2, %r14d, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %r14d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %r14d -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %r14d -; CHECK-NEXT: shll $8, %r14d +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; 
CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movzbl %al, %eax -; CHECK-NEXT: orl %r14d, %eax -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: pinsrw $3, %eax, %xmm0 -; CHECK-NEXT: addq $32, %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: popq %rbp +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: psrld $16, %xmm0 +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: maxps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: minps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-NEXT: cvttps2dq %xmm1, %xmm0 +; CHECK-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: packuswb %xmm0, %xmm0 +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i8_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vmaxph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2.55E+2,2.55E+2,2.55E+2,2.55E+2,2.55E+2,2.55E+2,2.55E+2,2.55E+2] +; AVX512-NEXT: vminph %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> %f) ret <8 x i8> %x } @@ -859,6 +985,20 @@ ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i16_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnleph %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vcvttph2dq %xmm0, %ymm1 +; AVX512-NEXT: vpmovdw %ymm1, %xmm1 {%k1} {z} +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %f) ret <8 x i16> %x } @@ -866,107 +1006,118 @@ define <8 x i32> @test_unsigned_v8i32_v8f16(<8 x half> %f) nounwind { ; CHECK-LABEL: test_unsigned_v8i32_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: subq $72, %rsp ; CHECK-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: psrlq $48, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: xorl %ebx, %ebx -; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: ucomiss %xmm1, %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movl $-1, %ebp -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; 
CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: cmpnleps %xmm0, %xmm3 +; CHECK-NEXT: movaps {{.*#+}} xmm5 = [4.29496704E+9,4.29496704E+9,4.29496704E+9,4.29496704E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm5 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm3 +; CHECK-NEXT: orps %xmm3, %xmm5 +; CHECK-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrld $16, %xmm0 ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; CHECK-NEXT: cvttps2dq %xmm3, %xmm0 +; CHECK-NEXT: movdqa %xmm0, %xmm1 +; CHECK-NEXT: psrad $31, %xmm1 +; CHECK-NEXT: movdqa %xmm3, %xmm2 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; CHECK-NEXT: cvttps2dq %xmm2, %xmm2 +; CHECK-NEXT: pand %xmm1, %xmm2 +; CHECK-NEXT: por %xmm0, %xmm2 +; CHECK-NEXT: pxor %xmm0, %xmm0 +; CHECK-NEXT: cmpnleps %xmm3, %xmm0 +; CHECK-NEXT: andnps %xmm2, %xmm0 +; CHECK-NEXT: movaps {{.*#+}} xmm1 = [4.29496704E+9,4.29496704E+9,4.29496704E+9,4.29496704E+9] +; CHECK-NEXT: cmpltps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] -; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] -; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: xorps %xmm3, %xmm3 +; CHECK-NEXT: cmpnleps %xmm0, %xmm3 +; CHECK-NEXT: movaps {{.*#+}} xmm4 = [4.29496704E+9,4.29496704E+9,4.29496704E+9,4.29496704E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm4 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm3 +; CHECK-NEXT: orps %xmm3, %xmm4 +; CHECK-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: callq __extendhfsf2@PLT -; CHECK-NEXT: cvttss2si %xmm0, %rax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmovbl %ebx, %eax -; CHECK-NEXT: ucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: cmoval %ebp, %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] +; CHECK-NEXT: cvttps2dq %xmm0, %xmm1 +; CHECK-NEXT: movdqa %xmm1, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: xorps %xmm4, %xmm4 +; CHECK-NEXT: cmpnleps %xmm0, %xmm4 +; CHECK-NEXT: movaps {{.*#+}} xmm3 = [4.29496704E+9,4.29496704E+9,4.29496704E+9,4.29496704E+9] +; CHECK-NEXT: cmpltps %xmm0, %xmm3 +; CHECK-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-NEXT: cvttps2dq %xmm0, %xmm0 +; CHECK-NEXT: pand %xmm2, %xmm0 +; CHECK-NEXT: por %xmm1, %xmm0 +; CHECK-NEXT: andnps %xmm0, %xmm4 +; CHECK-NEXT: movaps %xmm3, %xmm1 +; CHECK-NEXT: orps %xmm4, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: popq %rbx -; 
CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i32_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnleph %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vcvttph2udq %xmm0, %ymm1 {%k1} {z} +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} +; AVX512-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512-NEXT: retq %x = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> %f) ret <8 x i32> %x } @@ -1129,6 +1280,18 @@ ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: popq %r14 ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i64_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vcmpnleph %xmm0, %xmm1, %k0 +; AVX512-NEXT: knotb %k0, %k1 +; AVX512-NEXT: vcvttph2uqq %xmm0, %zmm1 {%k1} {z} +; AVX512-NEXT: vcmpgtph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %xmm0, %k1 +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: retq %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> %f) ret <8 x i64> %x } @@ -1302,6 +1465,174 @@ ; CHECK-NEXT: popq %r15 ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq +; +; AVX512-LABEL: test_unsigned_v8i128_v8f16: +; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbp +; AVX512-NEXT: pushq %r15 +; AVX512-NEXT: pushq %r14 +; AVX512-NEXT: pushq %r13 +; AVX512-NEXT: pushq %r12 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: subq $104, %rsp +; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512-NEXT: movq %rdi, %rbx +; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX512-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: xorl %r12d, %r12d +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 4-byte Reload +; AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss %xmm0, %xmm1 +; AVX512-NEXT: cmovbq %r12, %rdx +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; AVX512-NEXT: movq $-1, %r13 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %r13, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,1,3,3] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rdx +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %r13, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpsrlq $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rdx +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %r13, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[1,0] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rdx +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %r13, %rdx +; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vpsrldq $10, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rdx, %rbp +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %rbp +; AVX512-NEXT: cmovbq %r12, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: cmovaq %r13, %rbp +; AVX512-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[3,3,3,3] +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %r14 +; AVX512-NEXT: movq %rdx, %r15 +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovbq %r12, %r15 +; AVX512-NEXT: cmovbq %r12, %r14 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: cmovaq %r13, %r14 +; AVX512-NEXT: cmovaq %r13, %r15 +; AVX512-NEXT: vpsrldq $14, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: movq %rax, %r12 +; AVX512-NEXT: movq %rdx, %r13 +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: cmovbq %rax, %r13 +; AVX512-NEXT: cmovbq %rax, %r12 +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; 
AVX512-NEXT: movq $-1, %rax +; AVX512-NEXT: cmovaq %rax, %r12 +; AVX512-NEXT: cmovaq %rax, %r13 +; AVX512-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX512-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512-NEXT: callq __fixunssfti@PLT +; AVX512-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: cmovbq %rcx, %rdx +; AVX512-NEXT: cmovbq %rcx, %rax +; AVX512-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX512-NEXT: movq $-1, %rcx +; AVX512-NEXT: cmovaq %rcx, %rax +; AVX512-NEXT: cmovaq %rcx, %rdx +; AVX512-NEXT: movq %rdx, 8(%rbx) +; AVX512-NEXT: movq %rax, (%rbx) +; AVX512-NEXT: movq %r13, 120(%rbx) +; AVX512-NEXT: movq %r12, 112(%rbx) +; AVX512-NEXT: movq %r15, 104(%rbx) +; AVX512-NEXT: movq %r14, 96(%rbx) +; AVX512-NEXT: movq %rbp, 88(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 80(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 72(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 64(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 56(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 48(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 40(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 32(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 24(%rbx) +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movq %rax, 16(%rbx) +; AVX512-NEXT: movq %rbx, %rax +; AVX512-NEXT: addq $104, %rsp +; AVX512-NEXT: popq %rbx +; AVX512-NEXT: popq %r12 +; AVX512-NEXT: popq %r13 +; AVX512-NEXT: popq %r14 +; AVX512-NEXT: popq %r15 +; AVX512-NEXT: popq %rbp +; AVX512-NEXT: retq %x = call <8 x i128> @llvm.fptoui.sat.v8i128.v8f16(<8 x half> %f) ret <8 x i128> %x }
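
For reference, the `llvm.fptoui.sat.*` intrinsics checked above saturate instead of producing poison: inputs at or below zero (and NaN) clamp to 0, inputs above the destination type's maximum clamp to that maximum, and in-range values truncate toward zero. A minimal standalone sketch of that behavior (illustrative only, not part of the patch; the function name and constant vector are invented for the example):

; Expected to fold to <0, 0, 255, 0>: -1.0 and NaN clamp to 0,
; 0.5 truncates toward zero to 0, and 300.0 clamps to the i8 maximum 255.
define <4 x i8> @sat_semantics_example() {
  %x = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float> <float -1.0, float 0.5, float 300.0, float 0x7FF8000000000000>)
  ret <4 x i8> %x
}
declare <4 x i8> @llvm.fptoui.sat.v4i8.v4f32(<4 x float>)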