diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -1007,6 +1007,14 @@
         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
       }
     }
+
+    for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
+                    MVT::v4i32}) {
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+    }
   }
 
   if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
@@ -10271,6 +10279,80 @@
   return LowerVecReduce(Op, DAG, ST);
 }
 
+static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  if (!ST->hasNEON())
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VT = Op0.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  unsigned PairwiseIntrinsic = 0;
+  switch (Op->getOpcode()) {
+  default:
+    llvm_unreachable("Expected VECREDUCE opcode");
+  case ISD::VECREDUCE_UMIN:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
+    break;
+  case ISD::VECREDUCE_UMAX:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
+    break;
+  case ISD::VECREDUCE_SMIN:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
+    break;
+  case ISD::VECREDUCE_SMAX:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
+    break;
+  }
+  SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumActiveLanes = NumElts;
+
+  assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
+          NumActiveLanes == 2) &&
+         "Only expected a power 2 vector size");
+
+  // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
+  if (VT.is128BitVector()) {
+    SDValue Lo, Hi;
+    std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
+    VT = Lo.getValueType();
+    Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
+    NumActiveLanes /= 2;
+  }
+
+  // Use pairwise reductions until one lane remains.
+  while (NumActiveLanes > 1) {
+    Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
+    NumActiveLanes /= 2;
+  }
+
+  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                            DAG.getConstant(0, dl, MVT::i32));
+
+  // Result type may be wider than element type.
+  if (EltVT != Op.getValueType()) {
+    unsigned Extend = 0;
+    switch (Op->getOpcode()) {
+    default:
+      llvm_unreachable("Expected VECREDUCE opcode");
+    case ISD::VECREDUCE_UMIN:
+    case ISD::VECREDUCE_UMAX:
+      Extend = ISD::ZERO_EXTEND;
+      break;
+    case ISD::VECREDUCE_SMIN:
+    case ISD::VECREDUCE_SMAX:
+      Extend = ISD::SIGN_EXTEND;
+      break;
+    }
+    Res = DAG.getNode(Extend, dl, Op->getValueType(0), Res);
+  }
+  return Res;
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
@@ -10502,6 +10584,11 @@
   case ISD::VECREDUCE_FMIN:
   case ISD::VECREDUCE_FMAX:
     return LowerVecReduceF(Op, DAG, Subtarget);
+  case ISD::VECREDUCE_UMIN:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_SMAX:
+    return LowerVecReduceMinMax(Op, DAG, Subtarget);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
diff --git a/llvm/test/CodeGen/ARM/vecreduce-minmax.ll b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-none-eabi -mattr=+neon -verify-machineinstrs | FileCheck %s
+
+define i8 @test_umin_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_umin_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_smin_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_smin_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vmov.s8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_umax_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_umax_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_smax_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_smax_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vmov.s8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i16 @test_umin_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_umin_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u16 d16, d16, d16
+; CHECK-NEXT:    vpmin.u16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_smin_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_smin_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vmov.s16 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_umax_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_umax_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u16 d16, d16, d16
+; CHECK-NEXT:    vpmax.u16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_smax_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_smax_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s16 d16, d16, d16
+; CHECK-NEXT:    vpmax.s16 d16, d16, d16
+; CHECK-NEXT:    vmov.s16 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i32 @test_umin_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_umin_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_smin_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_smin_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_umax_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_umax_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_smax_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_smax_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i8 @test_umin_v16i8(<16 x i8> %x) {
+; CHECK-LABEL: test_umin_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vpmin.u8 d16, d17, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %x)
+  ret i8 %z
+}
+
+define i16 @test_smin_v8i16(<8 x i16> %x) {
+; CHECK-LABEL: test_smin_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vpmin.s16 d16, d17, d16
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vmov.s16 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %x)
+  ret i16 %z
+}
+
+define i32 @test_umax_v4i32(<4 x i32> %x) {
+; CHECK-LABEL: test_umax_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vpmax.u32 d16, d17, d16
+; CHECK-NEXT:    vpmax.u32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %x)
+  ret i32 %z
+}
+
+define i8 @test_umin_v32i8(<32 x i8> %x) {
+; CHECK-LABEL: test_umin_v32i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    mov r12, sp
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmin.u8 q8, q8, q9
+; CHECK-NEXT:    vpmin.u8 d16, d16, d17
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %x)
+  ret i8 %z
+}
+
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+
+declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)