Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1001,6 +1001,10 @@ setTargetDAGCombine(ISD::CTLZ); + setTargetDAGCombine(ISD::VECREDUCE_AND); + setTargetDAGCombine(ISD::VECREDUCE_OR); + setTargetDAGCombine(ISD::VECREDUCE_XOR); + // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = @@ -1165,8 +1169,14 @@ setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::VECREDUCE_AND, VT, Custom); + setOperationAction(ISD::VECREDUCE_OR, VT, Custom); + setOperationAction(ISD::VECREDUCE_XOR, VT, Custom); } setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom); + setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); @@ -13306,6 +13316,106 @@ DAG.getConstant(0, DL, MVT::i64)); } +static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, + SDLoc DL, SelectionDAG &DAG) { + unsigned ScalarOpcode; + switch (Opcode) { + case ISD::VECREDUCE_AND: + ScalarOpcode = ISD::AND; + break; + case ISD::VECREDUCE_OR: + ScalarOpcode = ISD::OR; + break; + case ISD::VECREDUCE_XOR: + ScalarOpcode = ISD::XOR; + break; + default: + llvm_unreachable("Expected bitwise vector reduction"); + return SDValue(); + } + + EVT VecVT = Vec.getValueType(); + assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() && + "Expected power-of-2 length vector"); + + EVT ElemVT = VecVT.getVectorElementType(); + + SDValue Result; + unsigned NumElems = VecVT.getVectorNumElements(); + + // Special case for boolean reductions + if (ElemVT == MVT::i1) { + // Split large vectors into smaller ones + if (NumElems > 16) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL); + EVT HalfVT = Lo.getValueType(); + SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi); + return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG); + } + + // Vectors that are less than 64 bits get widened to neatly fit a 64 bit + // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to + // this element size leads to the best codegen, since e.g. setcc results + // might need to be truncated otherwise. + EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u)); + + // any_ext doesn't work with umin/umax, so only use it for uadd. + unsigned ExtendOp = + ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND; + SDValue Extended = DAG.getNode( + ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec); + switch (ScalarOpcode) { + case ISD::AND: + Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended); + break; + case ISD::OR: + Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended); + break; + case ISD::XOR: + Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended); + break; + default: + llvm_unreachable("Unexpected Opcode"); + } + + Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1); + } else { + // Iteratively split the vector in half and combine using the bitwise + // operation until it fits in a 64 bit register. + while (VecVT.getSizeInBits() > 64) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL); + VecVT = Lo.getValueType(); + NumElems = VecVT.getVectorNumElements(); + Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi); + } + + EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits()); + + // Do the remaining work on a scalar since it allows the code generator to + // combine the shift and bitwise operation into one instruction and since + // integer instructions can have higher throughput than vector instructions. + SDValue Scalar = DAG.getBitcast(ScalarVT, Vec); + + // Iteratively combine the lower and upper halves of the scalar using the + // bitwise operation, halving the relevant region of the scalar in each + // iteration, until the relevant region is just one element of the original + // vector. + for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) { + SDValue ShiftAmount = + DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64); + SDValue Shifted = + DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount); + Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted); + } + + Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT); + } + + return DAG.getAnyExtOrTrunc(Result, DL, VT); +} + SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { SDValue Src = Op.getOperand(0); @@ -13357,6 +13467,11 @@ // Lower NEON reductions. SDLoc dl(Op); switch (Op.getOpcode()) { + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0), + Op.getValueType(), dl, DAG); case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: @@ -20847,6 +20962,22 @@ Op0ExtV, Op1ExtV, Op->getOperand(2)); } +static SDValue +performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + SDValue Vec = N->getOperand(0); + if (DCI.isBeforeLegalize() && + Vec.getValueType().getVectorElementType() == MVT::i1 && + Vec.getValueType().isFixedLengthVector() && + Vec.getValueType().isPow2VectorType()) { + SDLoc DL(N); + return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL, + DAG); + } + + return SDValue(); +} + static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -22015,6 +22146,10 @@ default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + return performVecReduceBitwiseCombine(N, DCI, DAG); case ISD::ADD: case ISD::SUB: return performAddSubCombine(N, DCI, DAG); Index: llvm/test/CodeGen/AArch64/dag-combine-setcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/dag-combine-setcc.ll +++ llvm/test/CodeGen/AArch64/dag-combine-setcc.ll @@ -193,10 +193,10 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: uminv b0, v0.8b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <8 x i8> %a, zeroinitializer %cast = bitcast <8 x i1> %cmp1 to i8 @@ -208,10 +208,10 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <16 x i8> %a, zeroinitializer %cast = bitcast <16 x i1> %cmp1 to i16 @@ -223,12 +223,14 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v32i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 ; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <32 x i8> %a, zeroinitializer %cast = bitcast <32 x i1> %cmp1 to i32 @@ -240,16 +242,18 @@ ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v64i1: ; CHECK: // %bb.0: ; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b ; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 ; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 ; CHECK-NEXT: bic v1.16b, v1.16b, v3.16b ; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: uminv b0, v0.16b -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: tst w8, #0x1 -; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: bic w0, w8, w9 ; CHECK-NEXT: ret %cmp1 = icmp ne <64 x i8> %a, zeroinitializer %cast = bitcast <64 x i1> %cmp1 to i64 Index: llvm/test/CodeGen/AArch64/double_reduct.ll =================================================================== --- llvm/test/CodeGen/AArch64/double_reduct.ll +++ llvm/test/CodeGen/AArch64/double_reduct.ll @@ -131,9 +131,9 @@ ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.and.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b) @@ -148,9 +148,9 @@ ; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.or.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b) @@ -165,9 +165,9 @@ ; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret %r1 = call i32 @llvm.vector.reduce.xor.i32.v8i32(<8 x i32> %a) %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b) Index: llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll =================================================================== --- llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll +++ llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll @@ -59,12 +59,14 @@ ; CHECK-NEXT: fcmgt v4.4s, v4.4s, #0.0 ; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v6.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp1 v1.8h, v4.8h, v5.8h +; CHECK-NEXT: uzp1 v1.8h, v6.8h, v7.8h +; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-NEXT: uzp1 v1.16b, v1.16b, v6.16b +; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-NEXT: mvn v0.16b, v0.16b ; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: bic w0, w9, w8 Index: llvm/test/CodeGen/AArch64/reduce-and.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-and.ll +++ llvm/test/CodeGen/AArch64/reduce-and.ll @@ -256,9 +256,14 @@ define i8 @test_redand_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redand_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w1 -; CHECK-NEXT: and w8, w8, w2 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: movi d0, #0xff00ff00ff00ff +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v3i8: @@ -273,14 +278,10 @@ define i8 @test_redand_v4i8(<4 x i8> %a) { ; CHECK-LABEL: test_redand_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i8: @@ -304,22 +305,11 @@ define i8 @test_redand_v8i8(<8 x i8> %a) { ; CHECK-LABEL: test_redand_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w11, w13, w12 -; CHECK-NEXT: and w9, w10, w11 -; CHECK-NEXT: and w8, w8, w14 -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w0, w8, w15 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: and x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v8i8: @@ -357,20 +347,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: and w10, w10, w11 -; CHECK-NEXT: and w11, w12, w13 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w10, w11, w14 -; CHECK-NEXT: and w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: and x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; @@ -411,20 +391,10 @@ ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: and w10, w10, w11 -; CHECK-NEXT: and w11, w12, w13 -; CHECK-NEXT: and w8, w8, w10 -; CHECK-NEXT: and w10, w11, w14 -; CHECK-NEXT: and w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: and x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; @@ -463,14 +433,10 @@ define i16 @test_redand_v4i16(<4 x i16> %a) { ; CHECK-LABEL: test_redand_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w10, w11, w10 -; CHECK-NEXT: and w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i16: @@ -496,12 +462,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; @@ -530,12 +493,9 @@ ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; @@ -562,10 +522,9 @@ define i32 @test_redand_v2i32(<2 x i32> %a) { ; CHECK-LABEL: test_redand_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v2i32: @@ -585,9 +544,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v4i32: @@ -609,9 +568,9 @@ ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redand_v8i32: Index: llvm/test/CodeGen/AArch64/reduce-or.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-or.ll +++ llvm/test/CodeGen/AArch64/reduce-or.ll @@ -256,8 +256,16 @@ define i8 @test_redor_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redor_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: orr w8, w0, w1 -; CHECK-NEXT: orr w0, w8, w2 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: lsr x10, x9, #32 +; CHECK-NEXT: lsr x9, x9, #16 +; CHECK-NEXT: orr w8, w8, w10 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v3i8: @@ -272,14 +280,10 @@ define i8 @test_redor_v4i8(<4 x i8> %a) { ; CHECK-LABEL: test_redor_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i8: @@ -303,22 +307,11 @@ define i8 @test_redor_v8i8(<8 x i8> %a) { ; CHECK-LABEL: test_redor_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w11, w13, w12 -; CHECK-NEXT: orr w9, w10, w11 -; CHECK-NEXT: orr w8, w8, w14 -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w0, w8, w15 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: orr x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v8i8: @@ -356,20 +349,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: orr w10, w10, w11 -; CHECK-NEXT: orr w11, w12, w13 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w10, w11, w14 -; CHECK-NEXT: orr w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: orr x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; @@ -410,20 +393,10 @@ ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: orr w10, w10, w11 -; CHECK-NEXT: orr w11, w12, w13 -; CHECK-NEXT: orr w8, w8, w10 -; CHECK-NEXT: orr w10, w11, w14 -; CHECK-NEXT: orr w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: orr x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; @@ -462,14 +435,10 @@ define i16 @test_redor_v4i16(<4 x i16> %a) { ; CHECK-LABEL: test_redor_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w10, w11, w10 -; CHECK-NEXT: orr w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i16: @@ -495,12 +464,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; @@ -529,12 +495,9 @@ ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: orr w8, w9, w8 -; CHECK-NEXT: orr w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; @@ -561,10 +524,9 @@ define i32 @test_redor_v2i32(<2 x i32> %a) { ; CHECK-LABEL: test_redor_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v2i32: @@ -584,9 +546,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v4i32: @@ -608,9 +570,9 @@ ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: orr w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: orr w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redor_v8i32: Index: llvm/test/CodeGen/AArch64/reduce-xor.ll =================================================================== --- llvm/test/CodeGen/AArch64/reduce-xor.ll +++ llvm/test/CodeGen/AArch64/reduce-xor.ll @@ -245,8 +245,16 @@ define i8 @test_redxor_v3i8(<3 x i8> %a) { ; CHECK-LABEL: test_redxor_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: eor w8, w0, w1 -; CHECK-NEXT: eor w0, w8, w2 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: lsr x10, x9, #32 +; CHECK-NEXT: lsr x9, x9, #16 +; CHECK-NEXT: eor w8, w8, w10 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v3i8: @@ -261,14 +269,10 @@ define i8 @test_redxor_v4i8(<4 x i8> %a) { ; CHECK-LABEL: test_redxor_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i8: @@ -292,22 +296,11 @@ define i8 @test_redxor_v8i8(<8 x i8> %a) { ; CHECK-LABEL: test_redxor_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.b[5] -; CHECK-NEXT: umov w9, v0.b[4] -; CHECK-NEXT: umov w10, v0.b[1] -; CHECK-NEXT: umov w11, v0.b[0] -; CHECK-NEXT: umov w12, v0.b[3] -; CHECK-NEXT: umov w13, v0.b[2] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: umov w15, v0.b[7] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w11, w13, w12 -; CHECK-NEXT: eor w9, w10, w11 -; CHECK-NEXT: eor w8, w8, w14 -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w0, w8, w15 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v8i8: @@ -345,20 +338,10 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: eor w10, w10, w11 -; CHECK-NEXT: eor w11, w12, w13 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w10, w11, w14 -; CHECK-NEXT: eor w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; @@ -399,20 +382,10 @@ ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.b[1] -; CHECK-NEXT: umov w9, v0.b[0] -; CHECK-NEXT: umov w10, v0.b[2] -; CHECK-NEXT: umov w11, v0.b[3] -; CHECK-NEXT: umov w12, v0.b[4] -; CHECK-NEXT: umov w13, v0.b[5] -; CHECK-NEXT: umov w14, v0.b[6] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: umov w9, v0.b[7] -; CHECK-NEXT: eor w10, w10, w11 -; CHECK-NEXT: eor w11, w12, w13 -; CHECK-NEXT: eor w8, w8, w10 -; CHECK-NEXT: eor w10, w11, w14 -; CHECK-NEXT: eor w8, w8, w10 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: eor x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; @@ -451,14 +424,10 @@ define i16 @test_redxor_v4i16(<4 x i16> %a) { ; CHECK-LABEL: test_redxor_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: umov w8, v0.h[3] -; CHECK-NEXT: umov w9, v0.h[2] -; CHECK-NEXT: umov w10, v0.h[1] -; CHECK-NEXT: umov w11, v0.h[0] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w10, w11, w10 -; CHECK-NEXT: eor w0, w10, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i16: @@ -484,12 +453,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; @@ -518,12 +484,9 @@ ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w8, v0.h[1] -; CHECK-NEXT: umov w9, v0.h[0] -; CHECK-NEXT: umov w10, v0.h[2] -; CHECK-NEXT: umov w11, v0.h[3] -; CHECK-NEXT: eor w8, w9, w8 -; CHECK-NEXT: eor w9, w10, w11 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: eor x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 ; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; @@ -550,10 +513,9 @@ define i32 @test_redxor_v2i32(<2 x i32> %a) { ; CHECK-LABEL: test_redxor_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v2i32: @@ -573,9 +535,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v4i32: @@ -597,9 +559,9 @@ ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: eor w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: eor w0, w8, w9 ; CHECK-NEXT: ret ; ; GISEL-LABEL: test_redxor_v8i32: Index: llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-log-reduce.ll @@ -51,7 +51,7 @@ define i8 @andv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -140,7 +140,7 @@ define i16 @andv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -229,7 +229,7 @@ define i32 @andv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -316,7 +316,7 @@ define i64 @andv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: andv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -409,7 +409,7 @@ define i8 @eorv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -498,7 +498,7 @@ define i16 @eorv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -587,7 +587,7 @@ define i32 @eorv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -674,7 +674,7 @@ define i64 @eorv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: eorv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] @@ -767,7 +767,7 @@ define i8 @orv_v64i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov w8, #32 +; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8] ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0] @@ -856,7 +856,7 @@ define i16 @orv_v32i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #16 +; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] @@ -945,7 +945,7 @@ define i32 @orv_v16i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #8 +; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] @@ -1032,7 +1032,7 @@ define i64 @orv_v8i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: orv_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: mov x8, #4 +; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] Index: llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll +++ llvm/test/CodeGen/AArch64/sve-fixed-length-ptest.ll @@ -17,8 +17,7 @@ ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b ; CHECK-NEXT: mov v1.d[1], v0.d[0] -; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z1.b +; CHECK-NEXT: umaxv b0, v1.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -35,10 +34,9 @@ ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -55,10 +53,9 @@ ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -78,10 +75,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -109,10 +105,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -135,10 +130,9 @@ ; CHECK-NEXT: fcmne p0.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z1.b +; CHECK-NEXT: umaxv b0, p0, z1.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -86,7 +86,7 @@ ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b ; CHECK-NEXT: orr z0.d, z1.d, z3.d ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: orv b0, p0, z0.b +; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -153,7 +153,7 @@ ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b ; CHECK-NEXT: and z0.d, z1.d, z3.d ; CHECK-NEXT: ptrue p0.b, vl16 -; CHECK-NEXT: andv b0, p0, z0.b +; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -85,9 +85,14 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { ; CHECK-LABEL: test_v3i8: ; CHECK: // %bb.0: -; CHECK-NEXT: and w8, w0, w1 -; CHECK-NEXT: and w8, w8, w2 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: movi d0, #0xff00ff00ff00ff +; CHECK-NEXT: mov v0.h[0], w0 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: lsr x9, x8, #16 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %b = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %a) ret i8 %b @@ -97,28 +102,21 @@ ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: umov w14, v0.b[6] ; CHECK-NEXT: mov v1.16b, v0.16b ; CHECK-NEXT: mov v1.b[9], w8 ; CHECK-NEXT: mov v1.b[10], w8 ; CHECK-NEXT: mov v1.b[11], w8 +; CHECK-NEXT: mov v1.b[12], w8 ; CHECK-NEXT: mov v1.b[13], w8 -; CHECK-NEXT: umov w8, v0.b[4] +; CHECK-NEXT: mov v1.b[14], w8 +; CHECK-NEXT: mov v1.b[15], w8 ; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: and v1.8b, v0.8b, v1.8b -; CHECK-NEXT: umov w9, v1.b[1] -; CHECK-NEXT: umov w10, v1.b[0] -; CHECK-NEXT: umov w11, v1.b[2] -; CHECK-NEXT: umov w12, v1.b[3] -; CHECK-NEXT: umov w13, v1.b[5] -; CHECK-NEXT: and w9, w10, w9 -; CHECK-NEXT: umov w10, v0.b[7] -; CHECK-NEXT: and w11, w11, w12 -; CHECK-NEXT: and w8, w8, w13 -; CHECK-NEXT: and w9, w9, w11 -; CHECK-NEXT: and w8, w8, w14 -; CHECK-NEXT: and w8, w9, w8 -; CHECK-NEXT: and w0, w8, w10 +; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: and x8, x8, x8, lsr #16 +; CHECK-NEXT: lsr x9, x8, #8 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %b = call i8 @llvm.vector.reduce.and.v9i8(<9 x i8> %a) ret i8 %b @@ -128,9 +126,10 @@ ; CHECK-LABEL: test_v3i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x8, x8, #32 +; CHECK-NEXT: and v1.8b, v0.8b, v1.8b +; CHECK-NEXT: fmov x9, d1 ; CHECK-NEXT: and w0, w9, w8 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v3i32(<3 x i32> %a) @@ -155,9 +154,9 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %b = call i24 @llvm.vector.reduce.and.v4i24(<4 x i24> %a) ret i24 %b @@ -181,9 +180,9 @@ ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: and w0, w9, w8 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: lsr x9, x8, #32 +; CHECK-NEXT: and w0, w8, w9 ; CHECK-NEXT: ret %b = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %a) ret i32 %b