diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10863,6 +10863,39 @@
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
   }
 
+  // Handle NEON vector reduction instructions that implicitly zero the high
+  // lanes. If we are using a BUILD_VECTOR to explicitly zero the high lanes,
+  // then we can propagate the BUILD_VECTOR away entirely.
+  // (build_vector (extract_vector_elt (umax ...), 0), 0...) -> (umax ...)
+  if (usesOnlyOneConstantValue && NumConstantLanes == NumElts - 1) {
+    SDValue Op0 = Op.getOperand(0);
+    bool ConstantLanesAreZero = false;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(ConstantValue.getNode()))
+      ConstantLanesAreZero = C->isZero();
+    else if (ConstantFPSDNode *C =
+                 dyn_cast<ConstantFPSDNode>(ConstantValue.getNode()))
+      ConstantLanesAreZero = C->isExactlyValue(0.0);
+    if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && ConstantLanesAreZero) {
+      SDValue Vec = Op0.getOperand(0);
+      SDValue Idx = Op0.getOperand(1);
+      ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx.getNode());
+      // Only fold lane-0 extracts, and only when the reduction node already
+      // has the BUILD_VECTOR's type, so the replacement is type-correct.
+      if (CIdx && CIdx->isZero() && Vec.getValueType() == VT &&
+          (Vec.getOpcode() == AArch64ISD::UMAXV ||
+           Vec.getOpcode() == AArch64ISD::UMINV ||
+           Vec.getOpcode() == AArch64ISD::SMAXV ||
+           Vec.getOpcode() == AArch64ISD::SMINV ||
+           Vec.getOpcode() == AArch64ISD::UADDV ||
+           Vec.getOpcode() == AArch64ISD::SADDV)) {
+        // NOTE: It would be nice to handle FMAXNM/FMINNM here as well, but
+        // they are currently modeled as intrinsics that return scalars,
+        // which prevents this pattern from being matchable.
+        return Vec;
+      }
+    }
+  }
+
   if (AllLanesExtractElt) {
     SDNode *Vector = nullptr;
     bool Even = false;
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-zeroing.ll b/llvm/test/CodeGen/AArch64/vecreduce-zeroing.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vecreduce-zeroing.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local noundef <4 x i32> @umaxv(<4 x i32> noundef %0) local_unnamed_addr #0 {
+; CHECK-LABEL: umaxv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    umaxv s0, v0.4s
+; CHECK-NEXT:    ret
+  %2 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %0)
+  %3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i64 0
+  ret <4 x i32> %3
+}
+
+define dso_local noundef <4 x i32> @uminv(<4 x i32> noundef %0) local_unnamed_addr #0 {
+; CHECK-LABEL: uminv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uminv s0, v0.4s
+; CHECK-NEXT:    ret
+  %2 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %0)
+  %3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i64 0
+  ret <4 x i32> %3
+}
+
+define dso_local noundef <4 x i32> @smaxv(<4 x i32> noundef %0) local_unnamed_addr #0 {
+; CHECK-LABEL: smaxv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    smaxv s0, v0.4s
+; CHECK-NEXT:    ret
+  %2 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %0)
+  %3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i64 0
+  ret <4 x i32> %3
+}
+
+define dso_local noundef <4 x i32> @sminv(<4 x i32> noundef %0) local_unnamed_addr #0 {
+; CHECK-LABEL: sminv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sminv s0, v0.4s
+; CHECK-NEXT:    ret
+  %2 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %0)
+  %3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i64 0
+  ret <4 x i32> %3
+}
+
+define dso_local noundef <4 x i32> @addv(<4 x i32> noundef %0) local_unnamed_addr #0 {
+; CHECK-LABEL: addv:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    ret
+  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %0)
+  %3 = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 %2, i64 0
+  ret <4 x i32> %3
+}
+
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) #1
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) #1
+
+attributes #0 = { mustprogress nofree nosync nounwind readnone willreturn uwtable "frame-pointer"="non-leaf" "min-legal-vector-width"="128" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+crypto,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.2a" }
+attributes #1 = { nofree nosync nounwind readnone willreturn }