diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22516,6 +22516,19 @@
     return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
   }
 
+  // vecreduce_or(insert_subvector(zero or undef, val)) -> vecreduce_or(val)
+  // vecreduce_and(insert_subvector(ones or undef, val)) -> vecreduce_and(val)
+  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
+    SDValue Vec = N0.getOperand(0);
+    SDValue Subvec = N0.getOperand(1);
+    if ((Opcode == ISD::VECREDUCE_OR &&
+         (Vec.isUndef() || isNullOrNullSplat(Vec))) ||
+        (Opcode == ISD::VECREDUCE_AND &&
+         (Vec.isUndef() || isAllOnesOrAllOnesSplat(Vec))))
+      return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), Subvec);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+;
+; OR reductions
+;
+
+define i1 @reduce_or_insert_subvec_into_zero(<vscale x 4 x i1> %in) {
+; CHECK-LABEL: reduce_or_insert_subvec_into_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+define i1 @reduce_or_insert_subvec_into_poison(<vscale x 4 x i1> %in) {
+; CHECK-LABEL: reduce_or_insert_subvec_into_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> poison, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+define i1 @reduce_or_insert_subvec_into_nonzero(<vscale x 4 x i1> %in, <vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_or_insert_subvec_into_nonzero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p2.h, p1.b
+; CHECK-NEXT:    punpkhi p1.h, p1.b
+; CHECK-NEXT:    punpkhi p2.h, p2.b
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p2.h
+; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
+; CHECK-NEXT:    ptest p0, p0.b
+; CHECK-NEXT:    cset w0, ne
+; CHECK-NEXT:    ret
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+;
+; AND reductions
+;
+
+define i1 @reduce_and_insert_subvec_into_ones(<vscale x 4 x i1> %in) {
+; CHECK-LABEL: reduce_and_insert_subvec_into_ones:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %allones.ins = insertelement <vscale x 16 x i1> poison, i1 1, i32 0
+  %allones = shufflevector <vscale x 16 x i1> %allones.ins, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %allones, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.and.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+define i1 @reduce_and_insert_subvec_into_poison(<vscale x 4 x i1> %in) {
+; CHECK-LABEL: reduce_and_insert_subvec_into_poison:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    not p0.b, p1/z, p0.b
+; CHECK-NEXT:    ptest p1, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> poison, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.and.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+define i1 @reduce_and_insert_subvec_into_nonzero(<vscale x 4 x i1> %in, <vscale x 16 x i1> %vec) {
+; CHECK-LABEL: reduce_and_insert_subvec_into_nonzero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    punpklo p3.h, p1.b
+; CHECK-NEXT:    punpkhi p1.h, p1.b
+; CHECK-NEXT:    punpkhi p3.h, p3.b
+; CHECK-NEXT:    ptrue p2.b
+; CHECK-NEXT:    uzp1 p0.h, p0.h, p3.h
+; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
+; CHECK-NEXT:    not p0.b, p2/z, p0.b
+; CHECK-NEXT:    ptest p2, p0.b
+; CHECK-NEXT:    cset w0, eq
+; CHECK-NEXT:    ret
+  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %in, i64 0)
+  %res = call i1 @llvm.vector.reduce.and.nxv16i1(<vscale x 16 x i1> %t)
+  ret i1 %res
+}
+
+declare i1 @llvm.vector.reduce.and.nxv16i1(<vscale x 16 x i1>)
+declare i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1>)
+declare <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
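
Note (illustrative, not part of the patch): the combine runs on SelectionDAG nodes during instruction selection, but for the OR case its effect corresponds to the IR-level rewrite sketched below. Zero is the identity element for OR (all-ones for AND), so the vector being inserted into contributes nothing to the reduction. The function names @or_reduce_before and @or_reduce_after are made up for this sketch.

; Reducing a subvector inserted into a zero vector...
declare <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
declare i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1>)
declare i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1>)

define i1 @or_reduce_before(<vscale x 4 x i1> %in) {
  ; Insert %in into an all-zero nxv16i1, then OR-reduce the full vector.
  %t = call <vscale x 16 x i1> @llvm.experimental.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %in, i64 0)
  %res = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %t)
  ret i1 %res
}

define i1 @or_reduce_after(<vscale x 4 x i1> %in) {
  ; ...is equivalent to OR-reducing the subvector alone.
  %res = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> %in)
  ret i1 %res
}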