diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -980,6 +980,7 @@
                                            MachineBasicBlock *MBB) const;
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
+    void addNEONVectorTypes();
     void addMVEVectorTypes(bool HasMVEFP);
     void addAllExtLoads(const MVT From, const MVT To, LegalizeAction Action);
     void setAllExpand(MVT VT);
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -244,6 +244,17 @@
   setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
 }
 
+void ARMTargetLowering::addNEONVectorTypes() {
+  const MVT IntTypes[] = {MVT::v8i8, MVT::v4i16, MVT::v2i32};
+
+  for (auto VT : IntTypes) {
+    setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+    setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+  }
+}
+
 void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
   const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
 
@@ -818,6 +829,9 @@
   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 
+  if (Subtarget->hasNEON())
+    addNEONVectorTypes();
+
   if (Subtarget->hasMVEIntegerOps())
     addMVEVectorTypes(Subtarget->hasMVEFloatOps());
 
@@ -10271,6 +10285,57 @@
   return LowerVecReduce(Op, DAG, ST);
 }
 
+static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  if (ST->hasMVEIntegerOps() || !ST->hasNEON())
+    return SDValue();
+
+  SDLoc dl(Op);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VT = Op0.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  unsigned PairwiseIntrinsic = 0;
+  switch (Op->getOpcode()) {
+  default:
+    llvm_unreachable("Expected VECREDUCE opcode");
+  case ISD::VECREDUCE_UMIN:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
+    break;
+  case ISD::VECREDUCE_UMAX:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
+    break;
+  case ISD::VECREDUCE_SMIN:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
+    break;
+  case ISD::VECREDUCE_SMAX:
+    PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
+    break;
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumActiveLanes = NumElts;
+
+  assert((NumActiveLanes == 8 || NumActiveLanes == 4 || NumActiveLanes == 2) &&
+         "Only expected a power 2 vector size");
+
+  // Use pairwise reductions until one lane remains
+  while (NumActiveLanes > 1) {
+    SDValue Ops[] = {DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32), Op0,
+                     Op0};
+    Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
+    NumActiveLanes /= 2;
+  }
+
+  SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+                            DAG.getConstant(0, dl, MVT::i32));
+
+  // Result type may be wider than element type.
+  if (EltVT != Op->getValueType(0))
+    Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
+  return Res;
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
     // Acquire/Release load/store is not legal for targets without a dmb or
diff --git a/llvm/test/CodeGen/ARM/vecreduce-minmax.ll b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/vecreduce-minmax.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+neon -verify-machineinstrs | FileCheck %s --check-prefix=CHECK
+
+define i8 @test_umin_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_umin_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vpmin.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_smin_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_smin_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vpmin.s8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_umax_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_umax_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vpmax.u8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i8 @test_smax_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_smax_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vpmax.s8 d16, d16, d16
+; CHECK-NEXT:    vmov.u8 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %x)
+  ret i8 %z
+}
+
+define i16 @test_umin_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_umin_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u16 d16, d16, d16
+; CHECK-NEXT:    vpmin.u16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_smin_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_smin_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vpmin.s16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_umax_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_umax_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u16 d16, d16, d16
+; CHECK-NEXT:    vpmax.u16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i16 @test_smax_v4i16(<4 x i16> %x) {
+; CHECK-LABEL: test_smax_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s16 d16, d16, d16
+; CHECK-NEXT:    vpmax.s16 d16, d16, d16
+; CHECK-NEXT:    vmov.u16 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %x)
+  ret i16 %z
+}
+
+define i32 @test_umin_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_umin_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.u32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_smin_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_smin_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmin.s32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_umax_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_umax_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.u32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i32 @test_smax_v2i32(<2 x i32> %x) {
+; CHECK-LABEL: test_smax_v2i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vpmax.s32 d16, d16, d16
+; CHECK-NEXT:    vmov.32 r0, d16[0]
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %x)
+  ret i32 %z
+}
+
+define i8 @test_umin_v32i8(<32 x i8> %x) {
+; CHECK-LABEL: test_umin_v32i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r11, lr}
+; CHECK-NEXT:    push {r4, r5, r11, lr}
+; CHECK-NEXT:    add r12, sp, #16
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vld1.64 {d18, d19}, [r12]
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmin.u8 q8, q8, q9
+; CHECK-NEXT:    vmin.u8 d16, d16, d17
+; CHECK-NEXT:    vmov.u8 r0, d16[1]
+; CHECK-NEXT:    vmov.u8 r1, d16[0]
+; CHECK-NEXT:    vmov.u8 r2, d16[2]
+; CHECK-NEXT:    vmov.u8 r3, d16[3]
+; CHECK-NEXT:    vmov.u8 r12, d16[4]
+; CHECK-NEXT:    vmov.u8 lr, d16[5]
+; CHECK-NEXT:    vmov.u8 r4, d16[6]
+; CHECK-NEXT:    vmov.u8 r5, d16[7]
+; CHECK-NEXT:    cmp r1, r0
+; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    cmp r0, r2
+; CHECK-NEXT:    movhs r0, r2
+; CHECK-NEXT:    cmp r0, r3
+; CHECK-NEXT:    movhs r0, r3
+; CHECK-NEXT:    cmp r0, r12
+; CHECK-NEXT:    movhs r0, r12
+; CHECK-NEXT:    cmp r0, lr
+; CHECK-NEXT:    movhs r0, lr
+; CHECK-NEXT:    cmp r0, r4
+; CHECK-NEXT:    movhs r0, r4
+; CHECK-NEXT:    cmp r0, r5
+; CHECK-NEXT:    movhs r0, r5
+; CHECK-NEXT:    pop {r4, r5, r11, lr}
+; CHECK-NEXT:    mov pc, lr
+entry:
+  %z = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %x)
+  ret i8 %z
+}
+
+declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
+declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
+declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
+declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
+declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
+declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
+declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
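
For readers unfamiliar with NEON's pairwise min/max, the sketch below is a hypothetical scalar model (not part of the patch; the function and variable names are invented for illustration) of what LowerVecReduceMinMax emits for the v8i8 umin case: each vpmin.u8 d16, d16, d16 step writes the minimum of adjacent lanes into the low half of the register, so log2(8) = 3 steps leave the overall minimum in lane 0, which is then extracted, matching the three vpmin.u8 instructions in test_umin_v8i8 above.

// Hypothetical scalar model (not part of the patch) of the pairwise
// reduction the lowering emits. vpmin.u8 Dd, Dn, Dm writes
// min(lane 2i, lane 2i+1) of Dn into the low half of Dd and the same for
// Dm into the high half; feeding the same register as both operands and
// repeating log2(NumElts) times leaves the reduction result in lane 0.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void vpmin_u8(uint8_t Dst[8], const uint8_t Dn[8], const uint8_t Dm[8]) {
  uint8_t Tmp[8];
  for (int I = 0; I < 4; ++I) {
    Tmp[I] = std::min(Dn[2 * I], Dn[2 * I + 1]);     // low half from Dn
    Tmp[I + 4] = std::min(Dm[2 * I], Dm[2 * I + 1]); // high half from Dm
  }
  std::copy(Tmp, Tmp + 8, Dst);
}

int main() {
  uint8_t V[8] = {9, 3, 7, 5, 2, 8, 6, 4};
  // Mirrors the while (NumActiveLanes > 1) loop: three pairwise steps for v8i8.
  for (unsigned NumActiveLanes = 8; NumActiveLanes > 1; NumActiveLanes /= 2)
    vpmin_u8(V, V, V);
  std::printf("%u\n", unsigned(V[0])); // prints 2, the minimum of the lanes
  return 0;
}

Because the same register is passed as both operands, the high half of every intermediate result merely duplicates the low half, which is why only lane 0 needs to be read at the end.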