diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -20700,8 +20700,9 @@
 
   SmallVector<SDValue, 16> MaskConstants;
   if (VecVT == MVT::v16i8) {
-    // v16i8 is a special case, as we need to split it into two halves and
-    // combine, perform the mask+addition twice, and then combine them.
+    // v16i8 is a special case, as we have 16 entries but only 8 positional bits
+    // per entry. We split it into two halves, apply the mask, zip the halves to
+    // create 8x 16-bit values, and then perform the vector reduce.
     for (unsigned Half = 0; Half < 2; ++Half) {
       for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
         MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
@@ -20711,25 +20712,13 @@
     SDValue RepresentativeBits =
         DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
 
-    EVT HalfVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext());
-    unsigned NumElementsInHalf = HalfVT.getVectorNumElements();
-
-    SDValue LowHalf =
-        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, RepresentativeBits,
-                    DAG.getConstant(0, DL, MVT::i64));
-    SDValue HighHalf =
-        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, RepresentativeBits,
-                    DAG.getConstant(NumElementsInHalf, DL, MVT::i64));
-
-    SDValue ReducedLowBits =
-        DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, LowHalf);
-    SDValue ReducedHighBits =
-        DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, HighHalf);
-
-    SDValue ShiftedHighBits =
-        DAG.getNode(ISD::SHL, DL, MVT::i16, ReducedHighBits,
-                    DAG.getConstant(NumElementsInHalf, DL, MVT::i32));
-    return DAG.getNode(ISD::OR, DL, MVT::i16, ShiftedHighBits, ReducedLowBits);
+    SDValue UpperRepresentativeBits =
+        DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
+                    RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
+    SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
+                                 RepresentativeBits, UpperRepresentativeBits);
+    Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
+    return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
   }
 
   // All other vector sizes.
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -34,11 +34,9 @@
 ; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    addv.8b b0, v0
-; CHECK-NEXT:    addv.8b b1, v1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    orr w0, w9, w8, lsl #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
@@ -324,11 +322,9 @@
 ; CHECK-NEXT:    ldr q1, [x8, lCPI10_0@PAGEOFF]
 ; CHECK-NEXT:    and.16b v0, v0, v1
 ; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    addv.8b b0, v0
-; CHECK-NEXT:    addv.8b b1, v1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    orr w0, w9, w8, lsl #8
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    fmov w0, s0
 ; CHECK-NEXT:    ret
   %bitmask = bitcast <16 x i1> %vec to i16
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -30,12 +30,9 @@
 ; CHECK-NEXT:    ldr q1, [x8, lCPI0_0@PAGEOFF]
 ; CHECK-NEXT:    bic.16b v0, v1, v0
 ; CHECK-NEXT:    ext.16b v1, v0, v0, #8
-; CHECK-NEXT:    addv.8b b0, v0
-; CHECK-NEXT:    addv.8b b1, v1
-; CHECK-NEXT:    fmov w9, s0
-; CHECK-NEXT:    fmov w8, s1
-; CHECK-NEXT:    orr w8, w9, w8, lsl #8
-; CHECK-NEXT:    strh w8, [x0]
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    addv.8h h0, v0
+; CHECK-NEXT:    str h0, [x0]
 ; CHECK-NEXT:    ret
   %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
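
For reference, here is a rough standalone sketch of the new v16i8 lowering, written with ACLE NEON intrinsics rather than SelectionDAG nodes. It is illustrative only and not part of the patch: the helper name and the assumption that `bits` already holds the comparison result ANDed with the positional-bit mask constants are mine.

  #include <arm_neon.h>
  #include <stdint.h>

  // Assumes byte i of each 8-byte half of `bits` is either 0 or (1 << (i % 8)),
  // i.e. the result of ANDing the compare with the mask constants built above.
  static inline uint16_t bitmask_from_representative_bits(uint8x16_t bits) {
    // Rotate by 8 bytes so the upper half lines up with the lower half
    // (mirrors the AArch64ISD::EXT node with offset 8 / ext.16b #8).
    uint8x16_t upper = vextq_u8(bits, bits, 8);
    // Interleave the two halves (AArch64ISD::ZIP1 / zip1.16b): 16-bit lane i
    // now holds bits[i] in its low byte and bits[i + 8] in its high byte.
    uint8x16_t zipped = vzip1q_u8(bits, upper);
    // Reinterpret as 8 x u16 and horizontally add (addv.8h). Each mask bit is
    // set in at most one lane, so the sum equals the OR: the 16-bit bitmask.
    return vaddvq_u16(vreinterpretq_u16_u8(zipped));
  }

This is why the ext is kept from the old sequence while the two addv.8b reductions, the two fmovs, and the final orr collapse into a single zip1.16b, addv.8h, and fmov (or a direct str h0 in the truncate-store test), as the updated CHECK lines show.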