Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13211,9 +13211,17 @@ EVT ElemVT = VecVT.getVectorElementType(); SDValue Result; + unsigned NumElems = VecVT.getVectorNumElements(); // special case for boolean reductions if (ElemVT == MVT::i1) { + // split large vectors into smaller ones + if (NumElems > 16) { + SDValue Lo, Hi; + std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL); + return getVectorBitwiseReduce(Opcode, DAG.getNode(ScalarOpcode, DL, Lo.getValueType(), Lo, Hi), VT, DL, DAG); + } + // Casting to i8 first leads to better codegen. SDValue Extended = DAG.getAnyExtOrTrunc(Vec, DL, VecVT.changeVectorElementType(MVT::i8)); @@ -13233,7 +13241,6 @@ Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1); } else { - unsigned NumElems = VecVT.getVectorNumElements(); SmallVector ShiftValues(NumElems, -1); SDValue Accumulator = Vec; Index: llvm/test/CodeGen/AArch64/dag-combine-setcc.ll =================================================================== --- llvm/test/CodeGen/AArch64/dag-combine-setcc.ll +++ llvm/test/CodeGen/AArch64/dag-combine-setcc.ll @@ -37,7 +37,7 @@ ; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: bic w0, w8, w9 @@ -51,14 +51,14 @@ define i1 @combine_setcc_eq_vecreduce_or_v64i1(<64 x i8> %a) { ; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 +; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 ; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 ; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-NEXT: umax v1.16b, v1.16b, v3.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: bic w0, w8, w9 @@ -100,9 +100,8 @@ define i1 @combine_setcc_ne_vecreduce_or_v32i1(<32 x i8> %a) { ; CHECK-LABEL: combine_setcc_ne_vecreduce_or_v32i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -116,13 +115,10 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) { ; CHECK-LABEL: combine_setcc_ne_vecreduce_or_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmtst v3.16b, v3.16b, v3.16b -; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b -; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-NEXT: umax v1.16b, v1.16b, v3.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -164,9 +160,8 @@ define i1 @combine_setcc_eq_vecreduce_and_v32i1(<32 x i8> %a) { ; CHECK-LABEL: combine_setcc_eq_vecreduce_and_v32i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -180,13 +175,10 @@ define i1 @combine_setcc_eq_vecreduce_and_v64i1(<64 x i8> %a) { ; CHECK-LABEL: combine_setcc_eq_vecreduce_and_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 -; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 -; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 +; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 -; CHECK-NEXT: umin v1.16b, v1.16b, v3.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 @@ -230,10 +222,10 @@ define i1 @combine_setcc_ne_vecreduce_and_v32i1(<32 x i8> %a) { ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v32i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b -; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: cmeq v1.16b, v1.16b, #0 +; CHECK-NEXT: bic v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: bic w0, w8, w9 @@ -247,14 +239,14 @@ define i1 @combine_setcc_ne_vecreduce_and_v64i1(<64 x i8> %a) { ; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v64i1: ; CHECK: // %bb.0: -; CHECK-NEXT: cmtst v3.16b, v3.16b, v3.16b -; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmtst v1.16b, v1.16b, v1.16b -; CHECK-NEXT: cmtst v2.16b, v2.16b, v2.16b +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmtst v0.16b, v0.16b, v0.16b -; CHECK-NEXT: umin v1.16b, v1.16b, v3.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b -; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b +; CHECK-NEXT: cmeq v3.16b, v3.16b, #0 +; CHECK-NEXT: cmeq v2.16b, v2.16b, #0 +; CHECK-NEXT: bic v1.16b, v1.16b, v3.16b +; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: bic w0, w8, w9 Index: llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll =================================================================== --- llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll +++ llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll @@ -48,24 +48,23 @@ define i1 @unordered_floating_point_compare_on_v32f32(<32 x float> %a_vec) { ; CHECK-LABEL: unordered_floating_point_compare_on_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: fcmgt v7.4s, v7.4s, #0.0 -; CHECK-NEXT: mov w9, #1 // =0x1 -; CHECK-NEXT: fcmgt v6.4s, v6.4s, #0.0 -; CHECK-NEXT: fcmgt v5.4s, v5.4s, #0.0 -; CHECK-NEXT: fcmgt v4.4s, v4.4s, #0.0 ; CHECK-NEXT: fcmgt v3.4s, v3.4s, #0.0 +; CHECK-NEXT: mov w9, #1 // =0x1 ; CHECK-NEXT: fcmgt v2.4s, v2.4s, #0.0 ; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0 ; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0 -; CHECK-NEXT: uzp1 v6.8h, v6.8h, v7.8h -; CHECK-NEXT: uzp1 v4.8h, v4.8h, v5.8h +; CHECK-NEXT: fcmgt v7.4s, v7.4s, #0.0 +; CHECK-NEXT: fcmgt v6.4s, v6.4s, #0.0 +; CHECK-NEXT: fcmgt v5.4s, v5.4s, #0.0 +; CHECK-NEXT: fcmgt v4.4s, v4.4s, #0.0 ; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-NEXT: uzp1 v1.16b, v4.16b, v6.16b +; CHECK-NEXT: uzp1 v1.8h, v6.8h, v7.8h +; CHECK-NEXT: uzp1 v3.8h, v4.8h, v5.8h ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b -; CHECK-NEXT: mvn v1.16b, v1.16b +; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b ; CHECK-NEXT: mvn v0.16b, v0.16b -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b +; CHECK-NEXT: orn v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: bic w0, w9, w8 Index: llvm/test/CodeGen/AArch64/vecreduce-bool.ll =================================================================== --- llvm/test/CodeGen/AArch64/vecreduce-bool.ll +++ llvm/test/CodeGen/AArch64/vecreduce-bool.ll @@ -99,9 +99,8 @@ define i32 @reduce_and_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_and_v32: ; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b ; CHECK-NEXT: uminv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1 @@ -197,9 +196,8 @@ define i32 @reduce_or_v32(<32 x i8> %a0, i32 %a1, i32 %a2) nounwind { ; CHECK-LABEL: reduce_or_v32: ; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: umax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: tst w8, #0x1