Lower integer add reductions (ISD::VECREDUCE_ADD) to MVE VADDV.

What this patch does, file by file:
  * TargetSelectionDAG.td: adds the SDTVecReduce type profile (integer
    result, vector operand) and the vecreduce_add SDNode.
  * ARMISelLowering.cpp: marks ISD::VECREDUCE_ADD as Legal for VT.
  * ARMInstrMVE.td: adds HasMVEInt patterns selecting vecreduce_add on
    v4i32 / v8i16 / v16i8 to MVE_VADDVu32no_acc / u16 / u8.
  * ARMTargetTransformInfo.{h,cpp}: adds useReductionIntrinsic(), which
    accepts only 128-bit integer Add reductions when MVE integer ops are
    available, and shouldExpandReduction(), which always returns false.
  * mve-vaddv.ll: new llc/FileCheck test, one function per element width.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; spacing inside each line is kept as
found. If context matching fails, retry with whitespace-insensitive matching
(e.g. `patch -l`).
NOTE(review): the template parameter list on the `class MVE_VADDLV` context
lines was missing and has been restored from memory; confirm it against
ARMInstrMVE.td before applying.

Index: llvm/include/llvm/Target/TargetSelectionDAG.td
===================================================================
--- llvm/include/llvm/Target/TargetSelectionDAG.td
+++ llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -239,6 +239,9 @@
 def SDTVecInsert : SDTypeProfile<1, 3, [ // vector insert
   SDTCisEltOfVec<2, 1>, SDTCisSameAs<0, 1>, SDTCisPtrTy<3>
 ]>;
+def SDTVecReduce : SDTypeProfile<1, 1, [ // vector reduction
+  SDTCisInt<0>, SDTCisVec<1>
+]>;
 
 def SDTSubVecExtract : SDTypeProfile<1, 2, [// subvector extract
   SDTCisSubVecOfVec<0,1>, SDTCisInt<2>
@@ -415,6 +418,8 @@
 def extractelt : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDTVecExtract>;
 def insertelt : SDNode<"ISD::INSERT_VECTOR_ELT", SDTVecInsert>;
 
+def vecreduce_add : SDNode<"ISD::VECREDUCE_ADD", SDTVecReduce>;
+
 def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>;
 def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>;
 def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -267,6 +267,9 @@
     setOperationAction(ISD::SREM, VT, Expand);
     setOperationAction(ISD::CTPOP, VT, Expand);
 
+    // Vector reductions
+    setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
+
     if (!HasMVEFP) {
       setOperationAction(ISD::SINT_TO_FP, VT, Expand);
       setOperationAction(ISD::UINT_TO_FP, VT, Expand);
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -549,6 +549,12 @@
 defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
 defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
 
+let Predicates = [HasMVEInt] in {
+  def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
+  def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
+}
+
 class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
                bit A, bit U, list<dag> pattern=[]>
   : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -156,6 +156,13 @@
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
                      Type *SubTp);
 
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             TTI::ReductionFlags Flags) const;
+
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    return false;
+  }
+
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
 
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1004,3 +1004,32 @@
   if (Cost < 12)
     UP.Force = true;
 }
+
+/// Returns true only for Instruction::Add reductions over a vector type whose
+/// total width (scalar bits * element count) is exactly 128 bits, and only
+/// when the subtarget has MVE integer ops. All other handled opcodes return
+/// false; an unhandled opcode is a programming error (llvm_unreachable).
+bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                                       TTI::ReductionFlags Flags) const {
+  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+  unsigned ScalarBits = Ty->getScalarSizeInBits();
+  if (!ST->hasMVEIntegerOps())
+    return false;
+
+  switch (Opcode) {
+  case Instruction::FAdd:
+  case Instruction::FMul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Mul:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return false;
+  case Instruction::Add:
+    return ScalarBits * Ty->getVectorNumElements() == 128;
+  default:
+    llvm_unreachable("Unhandled reduction opcode");
+  }
+  return false;
+}
Index: llvm/test/CodeGen/Thumb2/mve-vaddv.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -0,0 +1,34 @@
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
+
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+define arm_aapcs_vfpcc i32 @vaddv_v4i32_i32(<4 x i32> %s1) {
+; CHECK-LABEL: vaddv_v4i32_i32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u32 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %s1)
+  ret i32 %r
+}
+
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+define arm_aapcs_vfpcc i16 @vaddv_v8i16_i16(<8 x i16> %s1) {
+; CHECK-LABEL: vaddv_v8i16_i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u16 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %s1)
+  ret i16 %r
+}
+
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+define arm_aapcs_vfpcc i8 @vaddv_v16i8_i8(<16 x i8> %s1) {
+; CHECK-LABEL: vaddv_v16i8_i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vaddv.u8 r0, q0
+; CHECK-NEXT: bx lr
+entry:
+  %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %s1)
+  ret i8 %r
+}