diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -887,6 +887,7 @@
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::VECREDUCE_ADD);
+  setTargetDAGCombine(ISD::VECREDUCE_OR);
   setTargetDAGCombine(ISD::STEP_VECTOR);
 
   setTargetDAGCombine(ISD::MGATHER);
@@ -13275,10 +13276,83 @@
   return SDValue();
 }
 
+// If V is a sign-extend of a scalable predicate vector (possibly masked as
+// a fixed-width vector), return the original scalable predicate vector.
+// If no such predicate is found, returns SDValue().
+static SDValue findScalablePredicateOperand(SDValue V, SelectionDAG &DAG) {
+  const auto &Subtarget =
+      static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+  if (!V.getValueType().isVector() || !Subtarget.hasSVE())
+    return SDValue();
+
+  // Look through truncates and scalable -> fixed conversion.
+  while ((V.getOpcode() == ISD::TRUNCATE ||
+          (V.getValueType().isFixedLengthVector() &&
+           V.getOpcode() == ISD::EXTRACT_SUBVECTOR)) &&
+         V.hasOneUse())
+    V = V.getOperand(0);
+
+  if (V.getValueType().isScalableVector() &&
+      V.getOpcode() == ISD::SIGN_EXTEND &&
+      V.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+    return V.getOperand(0);
+
+  if (V.getValueType().isFixedLengthVector() &&
+      ISD::isConstantSplatVectorAllOnes(V.getNode())) {
+    EVT ScalableVT = getContainerForFixedLengthVector(DAG, V.getValueType())
+                         .changeVectorElementType(MVT::i1);
+    return DAG.getConstant(-1, SDLoc(V.getNode()), ScalableVT);
+  }
+
+  return SDValue();
+}
+
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+                                                EVT VT);
+static SDValue performVecreduceAndOrCombine(SDNode *N,
+                                            const AArch64TargetLowering &TLI,
+                                            SelectionDAG &DAG) {
+  EVT OpVT = N->getOperand(0).getValueType();
+  if (!OpVT.isFixedLengthVector())
+    return SDValue();
+
+  // Try to perform the operation on SVE predicate vectors, if available.
+  if (SDValue Pred = findScalablePredicateOperand(N->getOperand(0), DAG)) {
+    SDLoc DL(N);
+    EVT PromVT = getPromotedVTForPredicate(Pred.getValueType());
+    SDValue PredForVL = getPredicateForFixedLengthVector(
+        DAG, DL, OpVT.changeVectorElementType(PromVT.getVectorElementType()));
+
+    // If not all bits in the scalable vector are defined, we need to
+    // manually define these to be 0 or 1.
+    if (!TLI.isAllActivePredicate(DAG, PredForVL)) {
+      EVT PredVT = PredForVL.getValueType();
+      Pred = DAG.getNode(ISD::AND, DL, PredVT, Pred, PredForVL);
+    }
+
+    return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), Pred);
+  }
+
+  return SDValue();
+}
 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
+  // Try to perform the operation on SVE predicate vectors, if available.
+  SDValue NewLHS, NewRHS;
+  if (N->getValueType(0).isFixedLengthVector() &&
+      (NewLHS = findScalablePredicateOperand(N->getOperand(0), DAG)) &&
+      (NewRHS = findScalablePredicateOperand(N->getOperand(1), DAG))) {
+    assert(!(isa<ConstantSDNode>(NewLHS) && isa<ConstantSDNode>(NewRHS)) &&
+           "Expected nodes to have been constant folded");
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, N->getValueType(0));
+    SDValue PredXOR =
+        DAG.getNode(ISD::XOR, SDLoc(N), NewLHS.getValueType(), NewLHS, NewRHS);
+    SDValue Ext = DAG.getSExtOrTrunc(PredXOR, SDLoc(N), ContainerVT);
+    return convertFromScalableVector(DAG, N->getValueType(0), Ext);
+  }
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -14026,6 +14100,20 @@
   if (SDValue Res = tryCombineToBSL(N, DCI))
     return Res;
 
+  // Try to perform the operation on SVE predicate vectors, if available.
+  SDValue NewLHS, NewRHS;
+  if (VT.isFixedLengthVector() &&
+      (NewLHS = findScalablePredicateOperand(N->getOperand(0), DAG)) &&
+      (NewRHS = findScalablePredicateOperand(N->getOperand(1), DAG))) {
+    assert(!(isa<ConstantSDNode>(NewLHS) && isa<ConstantSDNode>(NewRHS)) &&
+           "Expected nodes to have been constant folded");
+    EVT ContainerVT = getContainerForFixedLengthVector(DAG, N->getValueType(0));
+    SDValue PredOR =
+        DAG.getNode(ISD::OR, SDLoc(N), NewLHS.getValueType(), NewLHS, NewRHS);
+    SDValue Ext = DAG.getSExtOrTrunc(PredOR, SDLoc(N), ContainerVT);
+    return convertFromScalableVector(DAG, N->getValueType(0), Ext);
+  }
+
   return SDValue();
 }
 
@@ -14280,6 +14368,17 @@
                                           SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
+  SDValue Pred;
+  if (N->getValueType(0).isFixedLengthVector() &&
+      (Pred = findScalablePredicateOperand(N->getOperand(0), DAG))) {
+    // This pattern can be recognised, but the users may not be automatically
+    // revisited by the DAGCombiner, so add the users to the worklist.
+    assert(!isa<ConstantSDNode>(Pred) &&
+           "Expected node to have been constant folded");
+    for (auto *Use : N->uses())
+      DCI.AddToWorklist(Use);
+  }
+
   // Since we are looking for a right shift by a constant value of 1 and we are
   // operating on types at least 16 bits in length (sign/zero extended OpA and
   // OpB, which are at least 8 bits), it follows that the truncate will always
@@ -17022,7 +17121,8 @@
   return performCONDCombine(N, DCI, DAG, 2, 3);
 }
 
-static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performSETCCCombine(SDNode *N, bool UseSVEForFixedLengthCompares,
+                                   SelectionDAG &DAG) {
   assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -17048,6 +17148,23 @@
     return DAG.getZExtOrTrunc(CSEL, DL, N->getValueType(0));
   }
 
+  // For fixed-length vectors, convert to a scalable SETCC operation early so
+  // that we can propagate any possible sign-extends to the uses of SETCC.
+  if (UseSVEForFixedLengthCompares && N->hasOneUse() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(N->getValueType(0))) {
+    SDLoc DL(N);
+    EVT ContainerVT =
+        getContainerForFixedLengthVector(DAG, N->getOperand(0).getValueType());
+    EVT PredVT = ContainerVT.changeVectorElementType(MVT::i1);
+    auto Op1 = convertToScalableVector(DAG, ContainerVT, N->getOperand(0));
+    auto Op2 = convertToScalableVector(DAG, ContainerVT, N->getOperand(1));
+    auto Cmp = DAG.getNode(ISD::SETCC, DL, PredVT, Op1, Op2, N->getOperand(2));
+    EVT PromoteVT = ContainerVT.changeTypeToInteger();
+    auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT,
+                                         N->getOperand(0).getValueType());
+    return convertFromScalableVector(DAG, N->getValueType(0), Promote);
+  }
+
   return SDValue();
 }
 
@@ -17089,8 +17206,6 @@
   return SDValue();
 }
 
-static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
-                                                EVT VT);
 // Pattern match utility function to return if V is a conversion of a
 // fixed-width vector -> scalable vector.
 static bool isConvertToScalableVector(SDValue V) {
@@ -18081,7 +18196,8 @@
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::SETCC:
-    return performSETCCCombine(N, DAG);
+    return performSETCCCombine(
+        N, useSVEForFixedLengthVectorVT(N->getValueType(0)), DAG);
   case ISD::LOAD:
     if (performTBISimplification(N->getOperand(1), DCI, DAG))
       return SDValue(N, 0);
@@ -18146,6 +18262,8 @@
     return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
   case AArch64ISD::UADDV:
     return performUADDVCombine(N, DAG);
+  case ISD::VECREDUCE_OR:
+    return performVecreduceAndOrCombine(N, *this, DAG);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
@@ -365,13 +365,12 @@
 ; CHECK-LABEL: fcmp_ueq_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmne p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -412,13 +411,12 @@
 ; CHECK-LABEL: fcmp_une_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -459,13 +457,12 @@
 ; CHECK-LABEL: fcmp_ugt_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -506,13 +503,12 @@
 ; CHECK-LABEL: fcmp_ult_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -553,13 +549,12 @@
 ; CHECK-LABEL: fcmp_uge_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -600,13 +595,12 @@
 ; CHECK-LABEL: fcmp_ule_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
@@ -647,13 +641,12 @@
 ; CHECK-LABEL: fcmp_ord_v16f16:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.h, vl16
-; CHECK-NEXT: mov w8, #65535
+; CHECK-NEXT: ptrue p2.h
 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
 ; CHECK-NEXT: fcmuo p1.h, p0/z, z0.h, z1.h
-; CHECK-NEXT: mov z1.h, w8
+; CHECK-NEXT: not p1.b, p2/z, p1.b
 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
 ; CHECK-NEXT: st1h { z0.h }, p0, [x2]
 ; CHECK-NEXT: ret
   %op1 = load <16 x half>, <16 x half>* %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll
@@ -6,21 +6,21 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #8
 ; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: not p1.b, p2/z, p1.b
+; CHECK-NEXT: not p0.b, p2/z, p0.b
 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: eor z1.d, z2.d, z1.d
+; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
 ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
-; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: orv b0, p0, z1.b
 ; CHECK-NEXT: fmov w8, s0
 ; CHECK-NEXT: and w0, w8, #0x1
@@ -36,17 +36,11 @@
 ; CHECK-LABEL: ptest_v16i1_512bit_min_sve:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl16
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: bic p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: cset w0, ne
 ; CHECK-NEXT: ret
   %v0 = bitcast float* %a to <16 x float>*
   %v1 = load <16 x float>, <16 x float>* %v0, align 4
@@ -59,17 +53,11 @@
 ; CHECK-LABEL: ptest_v16i1_512bit_sve:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
-; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: not p0.b, p0/z, p1.b
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: cset w0, ne
 ; CHECK-NEXT: ret
   %v0 = bitcast float* %a to <16 x float>*
   %v1 = load <16 x float>, <16 x float>* %v0, align 4
@@ -82,22 +70,17 @@
 ; CHECK-LABEL: ptest_or_v16i1_512bit_min_sve:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
-; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: eor z1.d, z2.d, z1.d
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: orr z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
-; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
-; CHECK-NEXT: orv b0, p0, z0.b
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: and w0, w8, #0x1
+; CHECK-NEXT: fcmeq p3.s, p0/z, z1.s, #0.0
+; CHECK-NEXT: not p1.b, p2/z, p1.b
+; CHECK-NEXT: not p2.b, p2/z, p3.b
+; CHECK-NEXT: sel p1.b, p1, p1.b, p2.b
+; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: ptest p0, p0.b
+; CHECK-NEXT: cset w0, ne
 ; CHECK-NEXT: ret
   %v0 = bitcast float* %a to <16 x float>*
   %v1 = load <16 x float>, <16 x float>* %v0, align 4
@@ -123,12 +106,10 @@
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
+; CHECK-NEXT: not p1.b, p0/z, p1.b
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: bic z0.d, z0.d, z1.d
+; CHECK-NEXT: bic p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
@@ -151,15 +132,14 @@
 ; CHECK-LABEL: ptest_and_v16i1_512bit_min_sve:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0
 ; CHECK-NEXT: fcmeq p0.s, p0/z, z1.s, #0.0
-; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: mov z1.s, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: eor z0.d, z0.d, z1.d
-; CHECK-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff
-; CHECK-NEXT: bic z0.d, z0.d, z1.d
+; CHECK-NEXT: not p1.b, p2/z, p1.b
+; CHECK-NEXT: bic p0.b, p1/z, p1.b, p0.b
+; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: ptrue p0.b, vl16
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b