diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -45449,9 +45449,9 @@ return FPOpcode; } -/// If both input operands of a logic op are being cast from floating point -/// types, try to convert this into a floating point logic node to avoid -/// unnecessary moves from SSE to integer registers. +/// If both input operands of a logic op are being cast from floating-point +/// types or FP compares, try to convert this into a floating-point logic node +/// to avoid unnecessary moves from SSE to integer registers. static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -45460,10 +45460,8 @@ SDValue N1 = N->getOperand(1); SDLoc DL(N); - if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST) - return SDValue(); - - if (DCI.isBeforeLegalizeOps()) + if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) || + (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC))) return SDValue(); SDValue N00 = N0.getOperand(0); @@ -45477,9 +45475,39 @@ (Subtarget.hasFP16() && N00Type == MVT::f16))) return SDValue(); - unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); - SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); - return DAG.getBitcast(VT, FPLogic); + if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) { + unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); + SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); + return DAG.getBitcast(VT, FPLogic); + } + + // The vector ISA for FP predicates is incomplete before AVX, so converting + // COMIS* to CMPS* may not be a win before AVX. + // TODO: Check types/predicates to see if they are available with SSE/SSE2. 
+ if (!Subtarget.hasAVX() || VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || + !N0.hasOneUse() || !N1.hasOneUse()) + return SDValue(); + + // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*) + // and vector logic: + // logic (setcc N00, N01), (setcc N10, N11) --> + // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0 + unsigned NumElts = 128 / N00Type.getSizeInBits(); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts); + EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL); + SDValue N01 = N0.getOperand(1); + SDValue N11 = N1.getOperand(1); + SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00); + SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01); + SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10); + SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11); + SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, + cast<CondCodeSDNode>(N0.getOperand(2))->get()); + SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, + cast<CondCodeSDNode>(N1.getOperand(2))->get()); + SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex); } // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) diff --git a/llvm/test/CodeGen/X86/fcmp-logic.ll b/llvm/test/CodeGen/X86/fcmp-logic.ll --- a/llvm/test/CodeGen/X86/fcmp-logic.ll +++ b/llvm/test/CodeGen/X86/fcmp-logic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=SSE2 -; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-- -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s 
-mtriple=x86_64-- -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512 define i1 @olt_ole_and_f32(float %w, float %x, float %y, float %z) { ; SSE2-LABEL: olt_ole_and_f32: @@ -13,14 +13,27 @@ ; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: olt_ole_and_f32: -; AVX: # %bb.0: -; AVX-NEXT: vucomiss %xmm0, %xmm1 -; AVX-NEXT: seta %cl -; AVX-NEXT: vucomiss %xmm2, %xmm3 -; AVX-NEXT: setae %al -; AVX-NEXT: andb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: olt_ole_and_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpleps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: olt_ole_and_f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpltps %zmm1, %zmm0, %k1 +; AVX512-NEXT: vcmpleps %zmm3, %zmm2, %k0 {%k1} +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %f1 = fcmp olt float %w, %x %f2 = fcmp ole float %y, %z %r = and i1 %f1, %f2 @@ -39,16 +52,28 @@ ; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: oge_oeq_or_f32: -; AVX: # %bb.0: -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: setae %cl -; AVX-NEXT: vucomiss %xmm3, %xmm2 -; AVX-NEXT: setnp %dl -; AVX-NEXT: sete %al -; AVX-NEXT: andb %dl, %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: oge_oeq_or_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpeqps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpleps %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: oge_oeq_or_f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # 
kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpeqps %zmm3, %zmm2, %k0 +; AVX512-NEXT: vcmpleps %zmm0, %zmm1, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %f1 = fcmp oge float %w, %x %f2 = fcmp oeq float %y, %z %r = or i1 %f1, %f2 @@ -65,14 +90,28 @@ ; SSE2-NEXT: xorb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: ord_one_xor_f32: -; AVX: # %bb.0: -; AVX-NEXT: vucomiss %xmm1, %xmm0 -; AVX-NEXT: setnp %cl -; AVX-NEXT: vucomiss %xmm3, %xmm2 -; AVX-NEXT: setne %al -; AVX-NEXT: xorb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: ord_one_xor_f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpneq_oqps %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpordps %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: ord_one_xor_f32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpneq_oqps %zmm3, %zmm2, %k0 +; AVX512-NEXT: vcmpordps %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %f1 = fcmp ord float %w, %x %f2 = fcmp one float %y, %z %r = xor i1 %f1, %f2 @@ -91,16 +130,27 @@ ; SSE2-NEXT: andb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: une_ugt_and_f64: -; AVX: # %bb.0: -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: setp %al -; AVX-NEXT: setne %cl -; AVX-NEXT: orb %al, %cl -; AVX-NEXT: vucomisd %xmm2, %xmm3 -; AVX-NEXT: 
setb %al -; AVX-NEXT: andb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: une_ugt_and_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnlepd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneqpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: une_ugt_and_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 +; AVX512-NEXT: vcmpnlepd %zmm3, %zmm2, %k0 {%k1} +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %f1 = fcmp une double %w, %x %f2 = fcmp ugt double %y, %z %r = and i1 %f1, %f2 @@ -117,14 +167,28 @@ ; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: ult_uge_or_f64: -; AVX: # %bb.0: -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: setb %cl -; AVX-NEXT: vucomisd %xmm2, %xmm3 -; AVX-NEXT: setbe %al -; AVX-NEXT: orb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: ult_uge_or_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpnltpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpnlepd %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vorpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: ult_uge_or_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpnltpd %zmm3, %zmm2, %k0 +; AVX512-NEXT: vcmpnlepd %zmm0, %zmm1, %k1 +; AVX512-NEXT: korw %k0, %k1, %k0 +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: 
vzeroupper +; AVX512-NEXT: retq %f1 = fcmp ult double %w, %x %f2 = fcmp uge double %y, %z %r = or i1 %f1, %f2 @@ -143,22 +207,37 @@ ; SSE2-NEXT: xorb %cl, %al ; SSE2-NEXT: retq ; -; AVX-LABEL: une_uno_xor_f64: -; AVX: # %bb.0: -; AVX-NEXT: vucomisd %xmm1, %xmm0 -; AVX-NEXT: setp %al -; AVX-NEXT: setne %cl -; AVX-NEXT: orb %al, %cl -; AVX-NEXT: vucomisd %xmm3, %xmm2 -; AVX-NEXT: setp %al -; AVX-NEXT: xorb %cl, %al -; AVX-NEXT: retq +; AVX1-LABEL: une_uno_xor_f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vcmpunordpd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vcmpneqpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: # kill: def $al killed $al killed $eax +; AVX1-NEXT: retq +; +; AVX512-LABEL: une_uno_xor_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm3 killed $xmm3 def $zmm3 +; AVX512-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vcmpunordpd %zmm3, %zmm2, %k0 +; AVX512-NEXT: vcmpneqpd %zmm1, %zmm0, %k1 +; AVX512-NEXT: kxorw %k0, %k1, %k0 +; AVX512-NEXT: kmovw %k0, %eax +; AVX512-NEXT: # kill: def $al killed $al killed $eax +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %f1 = fcmp une double %w, %x %f2 = fcmp uno double %y, %z %r = xor i1 %f1, %f2 ret i1 %r } +; This uses ucomis because the types do not match. +; TODO: Merge down to narrow type? + define i1 @olt_olt_and_f32_f64(float %w, float %x, double %y, double %z) { ; SSE2-LABEL: olt_olt_and_f32_f64: ; SSE2: # %bb.0: @@ -183,6 +262,8 @@ ret i1 %r } +; This uses ucomis because of extra uses. + define i1 @une_uno_xor_f64_use1(double %w, double %x, double %y, double %z, i1* %p) { ; SSE2-LABEL: une_uno_xor_f64_use1: ; SSE2: # %bb.0: @@ -214,6 +295,8 @@ ret i1 %r } +; This uses ucomis because of extra uses. 
+ define i1 @une_uno_xor_f64_use2(double %w, double %x, double %y, double %z, i1* %p) { ; SSE2-LABEL: une_uno_xor_f64_use2: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll --- a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll +++ b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll @@ -322,12 +322,11 @@ ; ALL-LABEL: test_zext_cmp11: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vucomisd %xmm2, %xmm0 -; ALL-NEXT: sete %al -; ALL-NEXT: vucomisd %xmm2, %xmm1 -; ALL-NEXT: sete %cl -; ALL-NEXT: orb %al, %cl -; ALL-NEXT: movzbl %cl, %eax +; ALL-NEXT: vcmpeqpd %xmm2, %xmm1, %xmm1 +; ALL-NEXT: vcmpeqpd %xmm2, %xmm0, %xmm0 +; ALL-NEXT: vorpd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: andl $1, %eax ; ALL-NEXT: retq entry: %cmp = fcmp fast oeq double %a, 0.000000e+00