Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1001,6 +1001,10 @@ setTargetDAGCombine(ISD::CTLZ); + setTargetDAGCombine(ISD::VECREDUCE_AND); + setTargetDAGCombine(ISD::VECREDUCE_OR); + setTargetDAGCombine(ISD::VECREDUCE_XOR); + // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = @@ -1165,8 +1169,14 @@ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); } setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); @@ -13176,6 +13186,85 @@ DAG.getConstant(0, DL, MVT::i64)); } +static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, + SDLoc DL, SelectionDAG &DAG) { + unsigned ScalarOpcode; + switch (Opcode) { + case ISD::VECREDUCE_AND: + ScalarOpcode = ISD::AND; + break; + case ISD::VECREDUCE_OR: + ScalarOpcode = ISD::OR; + break; + case ISD::VECREDUCE_XOR: + ScalarOpcode = ISD::XOR; + break; + default: + llvm_unreachable("Expected bitwise vector reduction"); + return SDValue(); + } + + EVT VecVT = Vec.getValueType(); + assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() && + "Expected power-of-2 length vector"); + + EVT ElemVT = VecVT.getVectorElementType(); + + SDValue Result; + unsigned NumElems = 
VecVT.getVectorNumElements();
+
+  // Special case for boolean reductions.
+  if (ElemVT == MVT::i1) {
+    // Split large vectors into smaller ones.
+    if (NumElems > 16) {
+      SDValue Lo, Hi;
+      std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
+      return getVectorBitwiseReduce(
+          Opcode, DAG.getNode(ScalarOpcode, DL, Lo.getValueType(), Lo, Hi), VT,
+          DL, DAG);
+    }
+
+    // Casting to i8 first leads to better codegen.
+    SDValue Extended =
+        DAG.getAnyExtOrTrunc(Vec, DL, VecVT.changeVectorElementType(MVT::i8));
+    switch (ScalarOpcode) {
+    case ISD::AND:
+      Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, MVT::i8, Extended);
+      break;
+    case ISD::OR:
+      Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, MVT::i8, Extended);
+      break;
+    case ISD::XOR:
+      Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i8, Extended);
+      break;
+    default:
+      llvm_unreachable("Unexpected Opcode");
+    }
+
+    Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
+  } else {
+    SmallVector<int, 16> ShiftValues(NumElems, -1);
+
+    SDValue Accumulator = Vec;
+
+    // Iteratively apply the bitwise operator to the first and second half
+    // of the vector until only one element remains.
+    for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
+      // ShiftValues should contain <Shift, Shift + 1, ..., 2 * Shift - 1,
+      // -1, -1, ..., -1>.
+      std::iota(ShiftValues.begin(), ShiftValues.begin() + Shift, Shift);
+      std::fill(ShiftValues.begin() + Shift, ShiftValues.end(), -1);
+      SDValue ShuffledVector = DAG.getVectorShuffle(
+          VecVT, DL, Accumulator, DAG.getUNDEF(VecVT), ShiftValues);
+      Accumulator =
+          DAG.getNode(ScalarOpcode, DL, VecVT, Accumulator, ShuffledVector);
+    }
+
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Accumulator,
+                         DAG.getConstant(0, DL, MVT::i64));
+  }
+
+  return DAG.getAnyExtOrTrunc(Result, DL, VT);
+}
+
 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
@@ -13183,9 +13272,6 @@
   // Try to lower fixed length reductions to SVE.
EVT SrcVT = Src.getValueType(); bool OverrideNEON = Subtarget->forceStreamingCompatibleSVE() || - Op.getOpcode() == ISD::VECREDUCE_AND || - Op.getOpcode() == ISD::VECREDUCE_OR || - Op.getOpcode() == ISD::VECREDUCE_XOR || Op.getOpcode() == ISD::VECREDUCE_FADD || (Op.getOpcode() != ISD::VECREDUCE_ADD && SrcVT.getVectorElementType() == MVT::i64); @@ -13227,6 +13313,11 @@ // Lower NEON reductions. SDLoc dl(Op); switch (Op.getOpcode()) { + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0), + Op.getValueType(), dl, DAG); case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: @@ -20485,6 +20576,22 @@ Op0ExtV, Op1ExtV, Op->getOperand(2)); } +static SDValue +performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Vec = N->getOperand(0); + if (DCI.isBeforeLegalize() && + Vec.getValueType().getVectorElementType() == MVT::i1 && + Vec.getValueType().isFixedLengthVector() && + Vec.getValueType().isPow2VectorType()) { + SDLoc DL(N); + return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL, + DAG); + } + + return SDValue(); +} + static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -21653,6 +21760,10 @@ default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return performVecReduceBitwiseCombine(N, DCI, DAG); case ISD::ADD: case ISD::SUB: return performAddSubCombine(N, DCI, DAG); Index: llvm/test/CodeGen/AArch64/dag-combine-setcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/dag-combine-setcc.ll +++ llvm/test/CodeGen/AArch64/dag-combine-setcc.ll @@ -193,10 +193,10 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst 
v0.8b, v0.8b, v0.8b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: uminv b0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <8 x i8> %a, zeroinitializer %cast = bitcast <8 x i1> %cmp1 to i8 @@ -208,10 +208,10 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <16 x i8> %a, zeroinitializer %cast = bitcast <16 x i1> %cmp1 to i16 @@ -223,12 +223,12 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v32i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 ; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <32 x i8> %a, zeroinitializer %cast = bitcast <32 x i1> %cmp1 to i32 @@ -240,6 +240,7 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v64i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b ; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 ; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 @@ -247,9 +248,8 @@ ; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <64 x i8> %a, zeroinitializer %cast = bitcast <64 x i1> %cmp1 to i64 Index: 
llvm/test/CodeGen/AArch64/double_reduct.ll =================================================================== --- llvm/test/CodeGen/AArch64/double_reduct.ll +++ llvm/test/CodeGen/AArch64/double_reduct.ll @@ -129,11 +129,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) @@ -146,11 +146,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) @@ -163,11 +163,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) 
%r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) Index: llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll =================================================================== --- llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll +++ llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: unordered_floating_point_compare_on_v8f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0 -; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: mvn v0.16b, v0.16b @@ -27,7 +27,7 @@ ; CHECK-LABEL: unordered_floating_point_compare_on_v16f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmgt v3.4s, v3.4s, #0.0 -; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: fcmgt v2.4s, v2.4s, #0.0 ; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0 ; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 @@ -49,7 +49,7 @@ ; CHECK-LABEL: unordered_floating_point_compare_on_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: fcmgt v3.4s, v3.4s, #0.0 -; CHECK-NEXT: mov w9, #1 +; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: fcmgt v2.4s, v2.4s, #0.0 ; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0 ; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 @@ -59,10 +59,10 @@ ; CHECK-NEXT: fcmgt v4.4s, v4.4s, #0.0 ; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v6.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp1 v1.8h, v4.8h, v5.8h +; CHECK-NEXT: uzp1 v1.8h, v6.8h, v7.8h +; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-NEXT: uzp1 v1.16b, v1.16b, v6.16b +; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b Index: llvm/test/CodeGen/AArch64/reduce-and.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-and.ll +++ 
llvm/test/CodeGen/AArch64/reduce-and.ll @@ -20,6 +20,8 @@ define i1 @test_redand_v2i1(<2 x i1> %a) { ; CHECK-LABEL: test_redand_v2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -41,6 +43,7 @@ define i1 @test_redand_v4i1(<4 x i1> %a) { ; CHECK-LABEL: test_redand_v4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: uminv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -246,9 +249,15 @@ define i8 @test_redand_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redand_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w1 -; CHECK-NEXT: and w8, w8, w2 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: movi d0, #0xff00ff00ff00ff +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v3i8: @@ -264,13 +273,11 @@ ; CHECK-LABEL: test_redand_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i8: @@ -295,21 +302,13 @@ ; CHECK-LABEL: test_redand_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov 
w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w11, w13, w12 -; CHECK-NEXT: and w9, w10, w11 -; CHECK-NEXT: and w8, w8, w14 -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w0, w8, w15 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v8i8: @@ -345,23 +344,15 @@ define i8 @test_redand_v16i8(<16 x i8> %a) { ; CHECK-LABEL: test_redand_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: and w10, w10, w11 -; CHECK-NEXT: and w11, w12, w13 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w10, w11, w14 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v16i8: @@ -399,23 +390,15 @@ ; CHECK-LABEL: test_redand_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov 
w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: and w10, w10, w11 -; CHECK-NEXT: and w11, w12, w13 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w10, w11, w14 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v32i8: @@ -454,13 +437,11 @@ ; CHECK-LABEL: test_redand_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i16: @@ -484,15 +465,13 @@ define i16 @test_redand_v8i16(<8 x i16> %a) { ; CHECK-LABEL: test_redand_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w9, w10, w11 -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: 
and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v8i16: @@ -518,15 +497,13 @@ ; CHECK-LABEL: test_redand_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w9, w10, w11 -; CHECK-NEXT: and w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v16i16: @@ -553,9 +530,9 @@ ; CHECK-LABEL: test_redand_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v2i32: @@ -573,11 +550,11 @@ define i32 @test_redand_v4i32(<4 x i32> %a) { ; CHECK-LABEL: test_redand_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i32: @@ -597,11 +574,11 @@ ; CHECK-LABEL: test_redand_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, 
s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v8i32: @@ -621,8 +598,8 @@ define i64 @test_redand_v2i64(<2 x i64> %a) { ; CHECK-LABEL: test_redand_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; @@ -641,8 +618,8 @@ ; CHECK-LABEL: test_redand_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; Index: llvm/test/CodeGen/AArch64/reduce-or.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-or.ll +++ llvm/test/CodeGen/AArch64/reduce-or.ll @@ -20,6 +20,8 @@ define i1 @test_redor_v2i1(<2 x i1> %a) { ; CHECK-LABEL: test_redor_v2i1: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -41,6 +43,7 @@ define i1 @test_redor_v4i1(<4 x i1> %a) { ; CHECK-LABEL: test_redor_v4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -246,8 +249,16 @@ define i8 @test_redor_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redor_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1 -; CHECK-NEXT: orr w0, w8, w2 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: dup v2.4h, w1 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: orr v0.8b, v0.8b, v2.8b +; 
CHECK-NEXT: mov v1.h[1], w1 +; CHECK-NEXT: mov v1.h[2], w2 +; CHECK-NEXT: dup v1.2s, v1.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v3i8: @@ -263,13 +274,11 @@ ; CHECK-LABEL: test_redor_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i8: @@ -294,21 +303,13 @@ ; CHECK-LABEL: test_redor_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w11, w13, w12 -; CHECK-NEXT: orr w9, w10, w11 -; CHECK-NEXT: orr w8, w8, w14 -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w15 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v8i8: @@ -344,23 +345,15 @@ define i8 @test_redor_v16i8(<16 x i8> %a) { ; CHECK-LABEL: test_redor_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, 
v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: orr w10, w10, w11 -; CHECK-NEXT: orr w11, w12, w13 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w10, w11, w14 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v16i8: @@ -398,23 +391,15 @@ ; CHECK-LABEL: test_redor_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: orr w10, w10, w11 -; CHECK-NEXT: orr w11, w12, w13 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w10, w11, w14 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v32i8: @@ -453,13 +438,11 @@ ; CHECK-LABEL: test_redor_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov 
w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i16: @@ -483,15 +466,13 @@ define i16 @test_redor_v8i16(<8 x i16> %a) { ; CHECK-LABEL: test_redor_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w9, w10, w11 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v8i16: @@ -517,15 +498,13 @@ ; CHECK-LABEL: test_redor_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w9, w10, w11 -; CHECK-NEXT: orr w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v16i16: @@ -552,9 +531,9 @@ ; CHECK-LABEL: test_redor_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // 
kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v2i32: @@ -572,11 +551,11 @@ define i32 @test_redor_v4i32(<4 x i32> %a) { ; CHECK-LABEL: test_redor_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i32: @@ -596,11 +575,11 @@ ; CHECK-LABEL: test_redor_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v8i32: @@ -620,8 +599,8 @@ define i64 @test_redor_v2i64(<2 x i64> %a) { ; CHECK-LABEL: test_redor_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; @@ -640,8 +619,8 @@ ; CHECK-LABEL: test_redor_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; Index: 
llvm/test/CodeGen/AArch64/reduce-xor.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-xor.ll +++ llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -245,8 +245,16 @@ define i8 @test_redxor_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redxor_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor w8, w0, w1 -; CHECK-NEXT: eor w0, w8, w2 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: dup v2.4h, w1 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: eor v0.8b, v0.8b, v2.8b +; CHECK-NEXT: mov v1.h[1], w1 +; CHECK-NEXT: mov v1.h[2], w2 +; CHECK-NEXT: dup v1.2s, v1.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v3i8: @@ -262,13 +270,11 @@ ; CHECK-LABEL: test_redxor_v4i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i8: @@ -293,21 +299,13 @@ ; CHECK-LABEL: test_redxor_v8i8: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w11, w13, w12 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: eor w8, w8, w14 -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w0, w8, w15 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; 
CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v8i8: @@ -343,23 +341,15 @@ define i8 @test_redxor_v16i8(<16 x i8> %a) { ; CHECK-LABEL: test_redxor_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: eor w10, w10, w11 -; CHECK-NEXT: eor w11, w12, w13 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w10, w11, w14 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v16i8: @@ -397,23 +387,15 @@ ; CHECK-LABEL: test_redxor_v32i8: ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: eor w10, w10, w11 -; CHECK-NEXT: eor w11, w12, w13 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w10, w11, w14 -; CHECK-NEXT: eor w8, w8, w10 
-; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v32i8: @@ -452,13 +434,11 @@ ; CHECK-LABEL: test_redxor_v4i16: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w0, w10, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i16: @@ -482,15 +462,13 @@ define i16 @test_redxor_v8i16(<8 x i16> %a) { ; CHECK-LABEL: test_redxor_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v8i16: @@ -516,15 +494,13 @@ ; CHECK-LABEL: test_redxor_v16i16: ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov 
w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: eor w0, w8, w9 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v16i16: @@ -551,9 +527,9 @@ ; CHECK-LABEL: test_redxor_v2i32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v2i32: @@ -571,11 +547,11 @@ define i32 @test_redxor_v4i32(<4 x i32> %a) { ; CHECK-LABEL: test_redxor_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i32: @@ -595,11 +571,11 @@ ; CHECK-LABEL: test_redxor_v8i32: ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v8i32: @@ -619,8 +595,8 @@ define i64 @test_redxor_v2i64(<2 x i64> %a) { ; CHECK-LABEL: test_redxor_v2i64: 
; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; @@ -639,8 +615,8 @@ ; CHECK-LABEL: test_redxor_v4i64: ; CHECK: // %bb.0: ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: ret ; Index: llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll @@ -13,10 +13,14 @@ define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: andv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a) ret i8 %res @@ -26,10 +30,15 @@ define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: andv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: 
and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a) ret i8 %res @@ -51,7 +60,7 @@ define i8 @andv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -102,10 +111,12 @@ define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: andv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a) ret i16 %res @@ -115,10 +126,13 @@ define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: andv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a) ret i16 %res @@ -140,7 +154,7 @@ define i16 @andv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: 
ld1h { z1.h }, p0/z, [x0] @@ -191,9 +205,9 @@ define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a) @@ -204,9 +218,10 @@ define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: andv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: andv s0, p0, z0.s +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a) @@ -229,7 +244,7 @@ define i32 @andv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -316,7 +331,7 @@ define i64 @andv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -371,10 +386,14 @@ define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: eorv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup 
v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a) ret i8 %res @@ -384,10 +403,15 @@ define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: eorv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a) ret i8 %res @@ -409,7 +433,7 @@ define i8 @eorv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -460,10 +484,12 @@ define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: eorv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a) ret i16 %res @@ -473,10 +499,13 @@ define i16 @eorv_v8i16(<8 x i16> %a) 
vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: eorv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a) ret i16 %res @@ -498,7 +527,7 @@ define i16 @eorv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -549,9 +578,9 @@ define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a) @@ -562,9 +591,10 @@ define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: eorv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: eorv s0, p0, z0.s +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a) @@ -587,7 +617,7 @@ define i32 @eorv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v16i32: ; VBITS_GE_256: // %bb.0: -; 
VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -674,7 +704,7 @@ define i64 @eorv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -729,10 +759,14 @@ define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: orv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.8b, v0.b[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a) ret i8 %res @@ -742,10 +776,15 @@ define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z0.b -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a) ret i8 %res @@ -767,7 +806,7 @@ define i8 @orv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v64i8: ; VBITS_GE_256: // 
%bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -818,10 +857,12 @@ define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: orv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a) ret i16 %res @@ -831,10 +872,13 @@ define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: orv h0, p0, z0.h -; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a) ret i16 %res @@ -856,7 +900,7 @@ define i16 @orv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -907,9 +951,9 @@ define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: orv s0, p0, z0.s 
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a) @@ -920,9 +964,10 @@ define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 { ; CHECK-LABEL: orv_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: orv s0, p0, z0.s +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a) @@ -945,7 +990,7 @@ define i32 @orv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1032,7 +1077,7 @@ define i64 @orv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] Index: llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll @@ -4,7 +4,7 @@ define i1 @ptest_v16i1_256bit_min_sve(ptr %a, ptr %b) vscale_range(2, 0) { ; CHECK-LABEL: ptest_v16i1_256bit_min_sve: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #8 +; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -17,8 +17,7 @@ ; CHECK-NEXT: uzp1 
z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z1.b +; CHECK-NEXT: umaxv b0, v1.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -35,10 +34,9 @@ ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -55,10 +53,9 @@ ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -78,10 +75,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -109,10 +105,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -135,10 +130,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: mov z0.s, 
p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z1.b +; CHECK-NEXT: umaxv b0, p0, z1.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -86,7 +86,7 @@ ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b ; CHECK-NEXT: orr z0.d, z1.d, z3.d ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -153,7 +153,7 @@ ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b ; CHECK-NEXT: and z0.d, z1.d, z3.d ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -85,9 +85,15 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: test_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w1 -; CHECK-NEXT: and w8, w8, w2 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: movi d0, #0xff00ff00ff00ff +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: dup v1.2s, v0.s[1] +; CHECK-NEXT: 
and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: dup v1.4h, v0.h[1] +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: umov w0, v0.h[0] ; CHECK-NEXT: ret %b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) ret i8 %b @@ -96,29 +102,23 @@ define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #-1 -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov v1.b[9], w8 -; CHECK-NEXT: mov v1.b[10], w8 -; CHECK-NEXT: mov v1.b[11], w8 -; CHECK-NEXT: mov v1.b[13], w8 -; CHECK-NEXT: umov w8, v0.b[4] -; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w9, v1.b[1] -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v1.b[2] -; CHECK-NEXT: umov w12, v1.b[3] -; CHECK-NEXT: umov w13, v1.b[5] -; CHECK-NEXT: and w9, w10, w9 -; CHECK-NEXT: umov w10, v0.b[7] -; CHECK-NEXT: and w11, w11, w12 -; CHECK-NEXT: and w8, w8, w13 -; CHECK-NEXT: and w9, w9, w11 -; CHECK-NEXT: and w8, w8, w14 -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-NEXT: mov v0.b[9], w8 +; CHECK-NEXT: mov v0.b[10], w8 +; CHECK-NEXT: mov v0.b[11], w8 +; CHECK-NEXT: mov v0.b[12], w8 +; CHECK-NEXT: mov v0.b[13], w8 +; CHECK-NEXT: mov v0.b[14], w8 +; CHECK-NEXT: mov v0.b[15], w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.8h, v0.h[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, v0.b[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umov w0, v0.b[0] ; CHECK-NEXT: ret %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) ret i8 %b @@ -127,11 +127,11 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind { ; CHECK-LABEL: test_v3i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; 
CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: dup v2.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) ret i32 %b @@ -140,6 +140,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind { ; CHECK-LABEL: test_v4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: uminv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -151,11 +152,11 @@ define i24 @test_v4i24(<4 x i24> %a) nounwind { ; CHECK-LABEL: test_v4i24: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) ret i24 %b @@ -177,11 +178,11 @@ ; CHECK-NEXT: and v1.16b, v1.16b, v3.16b ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: dup v1.2d, v0.d[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.4s, v0.s[1] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) ret i32 %b Index: llvm/test/CodeGen/AArch64/vecreduce-bool.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -33,8 +33,10 @@ ; CHECK-LABEL: reduce_and_v2: ; CHECK: // %bb.0: ; CHECK-NEXT: shl 
v0.2s, v0.2s, #24 +; CHECK-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1 @@ -52,6 +54,7 @@ ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: uminv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1 @@ -127,8 +130,10 @@ ; CHECK-LABEL: reduce_or_v2: ; CHECK: // %bb.0: ; CHECK-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NEXT: cmlt v0.2s, v0.2s, #0 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1 @@ -146,6 +151,7 @@ ; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: sshr v0.4h, v0.4h, #8 ; CHECK-NEXT: cmlt v0.4h, v0.4h, #0 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1 Index: llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -138,6 +138,7 @@ define i1 @test_v4i1(<4 x i1> %a) nounwind { ; CHECK-LABEL: test_v4i1: ; CHECK: // %bb.0: +; CHECK-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1