Index: llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -805,6 +805,7 @@ SDValue WidenVSELECTAndMask(SDNode *N); SDValue WidenVecRes_SELECT_CC(SDNode* N); SDValue WidenVecRes_SETCC(SDNode* N); + SDValue WidenVecRes_STRICT_FSETCC(SDNode* N); SDValue WidenVecRes_UNDEF(SDNode *N); SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N); @@ -834,6 +835,7 @@ SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo); SDValue WidenVecOp_SETCC(SDNode* N); + SDValue WidenVecOp_STRICT_FSETCC(SDNode* N); SDValue WidenVecOp_VSELECT(SDNode *N); SDValue WidenVecOp_Convert(SDNode *N); Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3039,6 +3039,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) { switch (N->getOpcode()) { + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: + return WidenVecRes_STRICT_FSETCC(N); /* STRICT_FSETCC and STRICT_FSETCCS share one result-widening routine */ case ISD::STRICT_FP_EXTEND: case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_TO_SINT: @@ -4116,6 +4119,47 @@ WidenVT, InOp1, InOp2, N->getOperand(2)); } +SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) { /* Widens the result of a vector STRICT_FSETCC or STRICT_FSETCCS node: unrolls it into one scalar compare per original element, reassembles a WidenVT build vector, and rewires the chain result of N to a TokenFactor of the scalar chains. */ + assert(N->getValueType(0).isVector() && + N->getOperand(1).getValueType().isVector() && + "Operands must be vectors"); + EVT VT = N->getValueType(0); + EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); /* type that TLI transforms VT to; used for the returned build vector */ + unsigned WidenNumElts = WidenVT.getVectorNumElements(); + unsigned NumElts = VT.getVectorNumElements(); + EVT EltVT = VT.getVectorElementType(); + + SDLoc dl(N); + SDValue Chain = N->getOperand(0); /* operand 0 is the incoming chain */ + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); /* operands 1 and 2 are the vectors being compared */ + SDValue CC = N->getOperand(3); /* operand 3 is passed unchanged to every scalar compare */ + EVT TmpEltVT =
LHS.getValueType().getVectorElementType(); /* element type of the compared operands, used for the extracts below */ + + // Fully unroll and reassemble. + SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT)); /* lanes at index NumElts and above are never overwritten and stay undef */ + SmallVector<SDValue, 8> Chains(NumElts); /* one output chain per scalar compare */ + for (unsigned i = 0; i != NumElts; ++i) { + SDValue LHSElem = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS, + DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + SDValue RHSElem = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS, + DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + + Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other}, + {Chain, LHSElem, RHSElem, CC}); /* scalar node with the same opcode as N: value 0 is the i1 result, value 1 the chain; every compare takes the original Chain as input */ + Chains[i] = Scalars[i].getValue(1); + Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i], + DAG.getBoolConstant(true, dl, EltVT, VT), + DAG.getBoolConstant(false, dl, EltVT, VT)); /* replace the i1 with the EltVT-typed true or false constant */ + } + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); /* join the per-element chains */ + ReplaceValueWith(SDValue(N, 1), NewChain); /* value 1 of N (its chain result) is replaced by the joined chain */ + + return DAG.getBuildVector(WidenVT, dl, Scalars); +} //===----------------------------------------------------------------------===// // Widen Vector Operand @@ -4147,6 +4191,8 @@ case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break; case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break; case ISD::SETCC: Res = WidenVecOp_SETCC(N); break; + case ISD::STRICT_FSETCC: + case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break; case ISD::VSELECT: Res = WidenVecOp_VSELECT(N); break; case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break; @@ -4590,6 +4636,44 @@ return DAG.getNode(ExtendCode, dl, VT, CC); } +SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) { /* Handles a STRICT_FSETCC or STRICT_FSETCCS whose vector operands are widened while the result keeps type VT: compares the first NumElts lanes of the widened operands as scalars and returns a VT build vector. NOTE(review): the loop body duplicates WidenVecRes_STRICT_FSETCC; a shared helper may be worth considering. */ + SDValue Chain = N->getOperand(0); + SDValue LHS = GetWidenedVector(N->getOperand(1)); + SDValue RHS = GetWidenedVector(N->getOperand(2)); /* LHS and RHS are the widened replacements for operands 1 and 2 */ + SDValue CC = N->getOperand(3); + SDLoc dl(N); + + EVT VT = N->getValueType(0); /* result type of N, used unchanged for the returned vector */ + EVT EltVT = VT.getVectorElementType(); + EVT TmpEltVT = LHS.getValueType().getVectorElementType(); + unsigned NumElts =
VT.getVectorNumElements(); /* element count of the result type, not of the widened operands */ + + // Unroll into a build vector. + SmallVector<SDValue, 8> Scalars(NumElts); + SmallVector<SDValue, 8> Chains(NumElts); + + for (unsigned i = 0; i != NumElts; ++i) { /* only lanes below NumElts of the widened operands are read */ + SDValue LHSElem = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS, + DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + SDValue RHSElem = DAG.getNode( + ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS, + DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))); + + Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other}, + {Chain, LHSElem, RHSElem, CC}); /* scalar node with the same opcode as N: value 0 is the i1 result, value 1 the chain */ + Chains[i] = Scalars[i].getValue(1); + Scalars[i] = DAG.getSelect(dl, EltVT, Scalars[i], + DAG.getBoolConstant(true, dl, EltVT, VT), + DAG.getBoolConstant(false, dl, EltVT, VT)); /* replace the i1 with the EltVT-typed true or false constant */ + } + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); /* join the per-element chains */ + ReplaceValueWith(SDValue(N, 1), NewChain); /* value 1 of N (its chain result) is replaced by the joined chain */ + + return DAG.getBuildVector(VT, dl, Scalars); +} + SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) { SDLoc dl(N); SDValue Op = GetWidenedVector(N->getOperand(0)); Index: llvm/test/CodeGen/X86/vec-strict-128-cmp.ll =================================================================== --- llvm/test/CodeGen/X86/vec-strict-128-cmp.ll +++ llvm/test/CodeGen/X86/vec-strict-128-cmp.ll @@ -6,6 +6,302 @@ ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-32 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX512-64 +define <2 x i32> @test_v2f32_ogt_s(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, <2 x float> %f2) #0 { +; SSE-32-LABEL: test_v2f32_ogt_s: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: andl $-16, %esp +; SSE-32-NEXT: subl $16, %esp +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: movl $-1, %ecx +;
SSE-32-NEXT: movl $0, %edx +; SSE-32-NEXT: cmoval %ecx, %edx +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-32-NEXT: comiss %xmm3, %xmm2 +; SSE-32-NEXT: cmoval %ecx, %eax +; SSE-32-NEXT: movd %eax, %xmm2 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: test_v2f32_ogt_s: +; SSE-64: # %bb.0: +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: comiss %xmm3, %xmm2 +; SSE-64-NEXT: movl $-1, %ecx +; SSE-64-NEXT: movl $0, %edx +; SSE-64-NEXT: cmoval %ecx, %edx +; SSE-64-NEXT: movd %edx, %xmm4 +; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-64-NEXT: comiss %xmm3, %xmm2 +; SSE-64-NEXT: cmoval %ecx, %eax +; SSE-64-NEXT: movd %eax, %xmm2 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-64-NEXT: pand %xmm4, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm4 +; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: test_v2f32_ogt_s: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: andl $-16, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-32-NEXT: xorl %eax, %eax +; AVX-32-NEXT: vcomiss %xmm4, %xmm5 +; AVX-32-NEXT: movl $-1, %ecx +; AVX-32-NEXT: movl $0, %edx +; AVX-32-NEXT: cmoval %ecx, %edx +; AVX-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX-32-NEXT: cmoval %ecx, %eax +; AVX-32-NEXT: vmovd %eax, %xmm2 +; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 +; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: retl +; 
+; AVX-64-LABEL: test_v2f32_ogt_s: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vcomiss %xmm4, %xmm5 +; AVX-64-NEXT: movl $-1, %ecx +; AVX-64-NEXT: movl $0, %edx +; AVX-64-NEXT: cmoval %ecx, %edx +; AVX-64-NEXT: vcomiss %xmm3, %xmm2 +; AVX-64-NEXT: cmoval %ecx, %eax +; AVX-64-NEXT: vmovd %eax, %xmm2 +; AVX-64-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 +; AVX-64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; +; AVX512-32-LABEL: test_v2f32_ogt_s: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-16, %esp +; AVX512-32-NEXT: subl $16, %esp +; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX512-32-NEXT: movw $-3, %ax +; AVX512-32-NEXT: kmovw %eax, %k0 +; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512-32-NEXT: seta %al +; AVX512-32-NEXT: andl $1, %eax +; AVX512-32-NEXT: kmovw %eax, %k1 +; AVX512-32-NEXT: kandw %k0, %k1, %k0 +; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512-32-NEXT: seta %al +; AVX512-32-NEXT: kmovw %eax, %k1 +; AVX512-32-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-32-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-32-NEXT: korw %k1, %k0, %k1 +; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v2f32_ogt_s: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: movw $-3, %ax +; AVX512-64-NEXT: kmovw %eax, %k0 +; AVX512-64-NEXT: vcomiss %xmm3, %xmm2 +; AVX512-64-NEXT: seta %al +; AVX512-64-NEXT: andl $1, %eax +; AVX512-64-NEXT: kmovw %eax, %k1 +; AVX512-64-NEXT: kandw %k0, %k1, %k0 +; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512-64-NEXT: vcomiss %xmm3, %xmm2 +; 
AVX512-64-NEXT: seta %al +; AVX512-64-NEXT: kmovw %eax, %k1 +; AVX512-64-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-64-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-64-NEXT: korw %k1, %k0, %k1 +; AVX512-64-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <2 x i1> @llvm.experimental.constrained.fcmps.v2f32( + <2 x float> %f1, <2 x float> %f2, metadata !"ogt", + metadata !"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i32> %a, <2 x i32> %b + ret <2 x i32> %res +} + +define <2 x i32> @test_v2f32_oeq_q(<2 x i32> %a, <2 x i32> %b, <2 x float> %f1, <2 x float> %f2) #0 { +; SSE-32-LABEL: test_v2f32_oeq_q: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: andl $-16, %esp +; SSE-32-NEXT: subl $16, %esp +; SSE-32-NEXT: movaps 8(%ebp), %xmm3 +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: movl $-1, %ecx +; SSE-32-NEXT: movl $-1, %edx +; SSE-32-NEXT: cmovnel %eax, %edx +; SSE-32-NEXT: cmovpl %eax, %edx +; SSE-32-NEXT: movd %edx, %xmm4 +; SSE-32-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-32-NEXT: ucomiss %xmm3, %xmm2 +; SSE-32-NEXT: cmovnel %eax, %ecx +; SSE-32-NEXT: cmovpl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm2 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-32-NEXT: pand %xmm4, %xmm0 +; SSE-32-NEXT: pandn %xmm1, %xmm4 +; SSE-32-NEXT: por %xmm4, %xmm0 +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: test_v2f32_oeq_q: +; SSE-64: # %bb.0: +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomiss %xmm3, %xmm2 +; SSE-64-NEXT: movl $-1, %ecx +; SSE-64-NEXT: movl $-1, %edx +; SSE-64-NEXT: cmovnel %eax, %edx +; SSE-64-NEXT: cmovpl %eax, %edx +; SSE-64-NEXT: movd %edx, %xmm4 +; SSE-64-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3] +; SSE-64-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3] +; SSE-64-NEXT: ucomiss %xmm3, %xmm2 +; 
SSE-64-NEXT: cmovnel %eax, %ecx +; SSE-64-NEXT: cmovpl %eax, %ecx +; SSE-64-NEXT: movd %ecx, %xmm2 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-64-NEXT: pand %xmm4, %xmm0 +; SSE-64-NEXT: pandn %xmm1, %xmm4 +; SSE-64-NEXT: por %xmm4, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: test_v2f32_oeq_q: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: andl $-16, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-32-NEXT: xorl %eax, %eax +; AVX-32-NEXT: vucomiss %xmm4, %xmm5 +; AVX-32-NEXT: movl $-1, %ecx +; AVX-32-NEXT: movl $-1, %edx +; AVX-32-NEXT: cmovnel %eax, %edx +; AVX-32-NEXT: cmovpl %eax, %edx +; AVX-32-NEXT: vucomiss %xmm3, %xmm2 +; AVX-32-NEXT: cmovnel %eax, %ecx +; AVX-32-NEXT: cmovpl %eax, %ecx +; AVX-32-NEXT: vmovd %ecx, %xmm2 +; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 +; AVX-32-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: test_v2f32_oeq_q: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm4, %xmm5 +; AVX-64-NEXT: movl $-1, %ecx +; AVX-64-NEXT: movl $-1, %edx +; AVX-64-NEXT: cmovnel %eax, %edx +; AVX-64-NEXT: cmovpl %eax, %edx +; AVX-64-NEXT: vucomiss %xmm3, %xmm2 +; AVX-64-NEXT: cmovnel %eax, %ecx +; AVX-64-NEXT: cmovpl %eax, %ecx +; AVX-64-NEXT: vmovd %ecx, %xmm2 +; AVX-64-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 +; AVX-64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; +; AVX512-32-LABEL: test_v2f32_oeq_q: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: pushl %ebp +; AVX512-32-NEXT: movl %esp, %ebp +; AVX512-32-NEXT: andl $-16, %esp +; AVX512-32-NEXT: subl $16, %esp +; AVX512-32-NEXT: 
vmovaps 8(%ebp), %xmm3 +; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512-32-NEXT: vucomiss %xmm4, %xmm5 +; AVX512-32-NEXT: setnp %al +; AVX512-32-NEXT: sete %cl +; AVX512-32-NEXT: testb %al, %cl +; AVX512-32-NEXT: setne %al +; AVX512-32-NEXT: kmovw %eax, %k0 +; AVX512-32-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-32-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-32-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-32-NEXT: setnp %al +; AVX512-32-NEXT: sete %cl +; AVX512-32-NEXT: testb %al, %cl +; AVX512-32-NEXT: setne %al +; AVX512-32-NEXT: andl $1, %eax +; AVX512-32-NEXT: kmovw %eax, %k1 +; AVX512-32-NEXT: movw $-3, %ax +; AVX512-32-NEXT: kmovw %eax, %k2 +; AVX512-32-NEXT: kandw %k2, %k1, %k1 +; AVX512-32-NEXT: korw %k0, %k1, %k1 +; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; AVX512-32-NEXT: movl %ebp, %esp +; AVX512-32-NEXT: popl %ebp +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_v2f32_oeq_q: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512-64-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX512-64-NEXT: vucomiss %xmm4, %xmm5 +; AVX512-64-NEXT: setnp %al +; AVX512-64-NEXT: sete %cl +; AVX512-64-NEXT: testb %al, %cl +; AVX512-64-NEXT: setne %al +; AVX512-64-NEXT: kmovw %eax, %k0 +; AVX512-64-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-64-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-64-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-64-NEXT: setnp %al +; AVX512-64-NEXT: sete %cl +; AVX512-64-NEXT: testb %al, %cl +; AVX512-64-NEXT: setne %al +; AVX512-64-NEXT: andl $1, %eax +; AVX512-64-NEXT: kmovw %eax, %k1 +; AVX512-64-NEXT: movw $-3, %ax +; AVX512-64-NEXT: kmovw %eax, %k2 +; AVX512-64-NEXT: kandw %k2, %k1, %k1 +; AVX512-64-NEXT: korw %k0, %k1, %k1 +; AVX512-64-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} +; AVX512-64-NEXT: retq + %cond = call <2 x i1> @llvm.experimental.constrained.fcmp.v2f32( + <2 x float> %f1, <2 x float> %f2, metadata !"oeq", + metadata 
!"fpexcept.strict") #0 + %res = select <2 x i1> %cond, <2 x i32> %a, <2 x i32> %b + ret <2 x i32> %res +} + define <4 x i32> @test_v4f32_oeq_q(<4 x i32> %a, <4 x i32> %b, <4 x float> %f1, <4 x float> %f2) #0 { ; SSE-32-LABEL: test_v4f32_oeq_q: ; SSE-32: # %bb.0: @@ -4456,7 +4752,9 @@ attributes #0 = { strictfp nounwind } +declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f32(<2 x float>, <2 x float>, metadata, metadata) declare <4 x i1> @llvm.experimental.constrained.fcmp.v4f32(<4 x float>, <4 x float>, metadata, metadata) declare <2 x i1> @llvm.experimental.constrained.fcmp.v2f64(<2 x double>, <2 x double>, metadata, metadata) +declare <2 x i1> @llvm.experimental.constrained.fcmps.v2f32(<2 x float>, <2 x float>, metadata, metadata) declare <4 x i1> @llvm.experimental.constrained.fcmps.v4f32(<4 x float>, <4 x float>, metadata, metadata) declare <2 x i1> @llvm.experimental.constrained.fcmps.v2f64(<2 x double>, <2 x double>, metadata, metadata)