Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -902,12 +902,6 @@ llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_SVE_FP_Reduce_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], - [LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, - llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_SVE_ReduceWithInit_Intrinsic : Intrinsic<[LLVMVectorElementType<0>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -915,13 +909,6 @@ llvm_anyvector_ty], [IntrNoMem]>; - class AdvSIMD_SVE_FP_ReduceWithInit_Intrinsic - : Intrinsic<[llvm_anyfloat_ty], - [LLVMScalarOrSameVectorWidth<1, llvm_i1_ty>, - LLVMMatchType<0>, - llvm_anyvector_ty], - [IntrNoMem]>; - class AdvSIMD_SVE_ShiftByImm_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -1687,12 +1674,12 @@ // Floating-point reductions // -def int_aarch64_sve_fadda : AdvSIMD_SVE_FP_ReduceWithInit_Intrinsic; -def int_aarch64_sve_faddv : AdvSIMD_SVE_FP_Reduce_Intrinsic; -def int_aarch64_sve_fmaxv : AdvSIMD_SVE_FP_Reduce_Intrinsic; -def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_FP_Reduce_Intrinsic; -def int_aarch64_sve_fminv : AdvSIMD_SVE_FP_Reduce_Intrinsic; -def int_aarch64_sve_fminnmv : AdvSIMD_SVE_FP_Reduce_Intrinsic; +def int_aarch64_sve_fadda : AdvSIMD_SVE_ReduceWithInit_Intrinsic; +def int_aarch64_sve_faddv : AdvSIMD_SVE_Reduce_Intrinsic; +def int_aarch64_sve_fmaxv : AdvSIMD_SVE_Reduce_Intrinsic; +def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_Reduce_Intrinsic; +def int_aarch64_sve_fminv : AdvSIMD_SVE_Reduce_Intrinsic; +def int_aarch64_sve_fminnmv : AdvSIMD_SVE_Reduce_Intrinsic; // // Floating-point conversions Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,6 +211,14 @@ REV, TBL, + // Floating-point reductions. 
+  FADDA_PRED,
+  FADDV_PRED,
+  FMAXV_PRED,
+  FMAXNMV_PRED,
+  FMINV_PRED,
+  FMINNMV_PRED,
+
   INSR,
   PTEST,
   PTRUE,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -890,6 +890,8 @@
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);

     for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+
       if (isTypeLegal(VT)) {
         setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       }
@@ -1361,6 +1363,12 @@
   case AArch64ISD::REV:               return "AArch64ISD::REV";
   case AArch64ISD::REINTERPRET_CAST:  return "AArch64ISD::REINTERPRET_CAST";
   case AArch64ISD::TBL:               return "AArch64ISD::TBL";
+  case AArch64ISD::FADDA_PRED:        return "AArch64ISD::FADDA_PRED";
+  case AArch64ISD::FADDV_PRED:        return "AArch64ISD::FADDV_PRED";
+  case AArch64ISD::FMAXV_PRED:        return "AArch64ISD::FMAXV_PRED";
+  case AArch64ISD::FMAXNMV_PRED:      return "AArch64ISD::FMAXNMV_PRED";
+  case AArch64ISD::FMINV_PRED:        return "AArch64ISD::FMINV_PRED";
+  case AArch64ISD::FMINNMV_PRED:      return "AArch64ISD::FMINNMV_PRED";
   case AArch64ISD::NOT:               return "AArch64ISD::NOT";
   case AArch64ISD::BIT:               return "AArch64ISD::BIT";
   case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
@@ -8393,13 +8401,50 @@
 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
+  SDLoc DL(Op);

-  // Check for non-constant or out of range lane.
   EVT VT = Op.getOperand(0).getValueType();
-  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
+  auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+  if (VT.isScalableVector()) {
+    if (CIdx) {
+      // Leave type legalisation to common code.
+      if (!isTypeLegal(VT))
+        return SDValue();
+
+      // Ignore extracts from unpacked vectors.
+      if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+        return Op;
+
+      // Ignore extracts whose index is beyond the range of NEON.
+      if (CIdx->getZExtValue() >= VT.getVectorElementCount().Min)
+        return Op;
+
+      // ValueType for NEON part of the SVE input.
+      EVT SubVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                   VT.getVectorNumElements());
+      assert(isTypeLegal(SubVT) && "Unexpected Subtype during extract!");
+
+      // The requested element is within the NEON part of the SVE register so
+      // we can use more efficient NEON instructions to do the work.
+      auto Bottom128 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Op.getOperand(0),
+                      DAG.getConstant(0, DL, MVT::i64));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
+                         Bottom128, Op.getOperand(1));
+    }
+
+    // We only care about type legalisation from this point onwards.
+    if (isTypeLegal(VT))
+      return Op;
+    return SDValue();
+  }
+
+  // Only SVE supports non-constant extracts.
+  if (!CIdx || (CIdx && CIdx->getZExtValue() >= VT.getVectorNumElements()))
     return SDValue();

   // Insertion/extraction are legal for V128 types.
   if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
@@ -8413,7 +8458,6 @@

   // For V64 types, we perform extraction by expanding the value
   // to a V128 type and perform the extraction on that.
- SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); @@ -11248,6 +11292,46 @@ return DAG.getZExtOrTrunc(Res, DL, VT); } +static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue VecToReduce = N->getOperand(2); + + EVT ReduceVT = VecToReduce.getValueType(); + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + +static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Pred = N->getOperand(1); + SDValue InitVal = N->getOperand(2); + SDValue VecToReduce = N->getOperand(3); + EVT ReduceVT = VecToReduce.getValueType(); + + // Ordered reductions use the first lane of the result vector as the + // reduction's initial value. + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT, + DAG.getUNDEF(ReduceVT), InitVal, Zero); + + SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce); + + // SVE reductions set the whole vector register with the first element + // containing the reduction result, which we'll now extract. + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce, + Zero); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -11325,6 +11409,18 @@ N->getOperand(1)); case Intrinsic::aarch64_sve_ext: return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_fadda: + return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG); + case Intrinsic::aarch64_sve_faddv: + return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxnmv: + return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fmaxv: + return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_fminnmv: + return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG); + case Intrinsic::aarch64_sve_fminv: + return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG); case Intrinsic::aarch64_sve_sel: return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2), N->getOperand(3)); Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -8436,8 +8436,8 @@ asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8463,8 +8463,8 @@ asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8529,8 +8529,8 @@ 
(v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn), (extract_high_v8i16 - (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))))]> { + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8561,8 +8561,8 @@ (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn), (extract_high_v4i32 - (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))))]> { + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8616,8 +8616,8 @@ asm#"2", ".4s", ".4s", ".8h", ".h", [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; @@ -8643,8 +8643,8 @@ asm#"2", ".2d", ".2d", ".4s", ".s", [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; @@ -8675,8 +8675,8 @@ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (extract_high_v8i16 V128:$Rn), - (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), - VectorIndexH:$idx))))]> { + (extract_high_v8i16 (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx)))))]> { bits<3> idx; let Inst{11} = idx{2}; let Inst{21} = idx{1}; @@ -8702,8 +8702,8 @@ [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (extract_high_v4i32 V128:$Rn), - (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm), - VectorIndexS:$idx))))]> { + (extract_high_v4i32 (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))))]> { bits<2> idx; let Inst{11} = idx{1}; let Inst{21} = idx{0}; Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5170,8 +5170,8 @@ // If none did, fallback to the explicit patterns, consuming the vector_extract. 
-def : Pat<(i32 (vector_extract (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
-            (i32 0)), (i64 0))),
+def : Pat<(i32 (vector_extract (v16i8 (insert_subvector undef, (v8i8 (opNode V64:$Rn)),
+            (i32 0))), (i64 0))),
           (EXTRACT_SUBREG (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub), ssub)>;
           (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub), ssub)>;
-def : Pat<(i32 (vector_extract (insert_subvector undef,
-            (v4i16 (opNode V64:$Rn)), (i32 0)), (i64 0))),
+def : Pat<(i32 (vector_extract (v8i16 (insert_subvector undef,
+            (v4i16 (opNode V64:$Rn)), (i32 0))), (i64 0))),
           (EXTRACT_SUBREG (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub), ssub)>;
@@ -5200,20 +5200,20 @@
   : SIMDAcrossLanesIntrinsic {
 // If there is a sign extension after this intrinsic, consume it as smov already
 // performed it
-def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
-            (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), i8)),
-          (i32 (SMOVvi8to32
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-            (i64 0)))>;
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (v16i8 (insert_subvector undef,
+            (opNode (v8i8 V64:$Rn)), (i32 0))), (i64 0))), i8)),
+          (i32 (SMOVvi8to32
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+            (i64 0)))>;
 def : Pat<(i32 (sext_inreg (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))), i8)),
           (i32 (SMOVvi8to32
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
             (i64 0)))>;
-def : Pat<(i32 (sext_inreg (i32 (vector_extract (insert_subvector undef,
-            (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), i16)),
+def : Pat<(i32 (sext_inreg (i32 (vector_extract (v8i16 (insert_subvector undef,
+            (opNode (v4i16 V64:$Rn)), (i32 0))), (i64 0))), i16)),
           (i32 (SMOVvi16to32
             (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
               (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
@@ -5231,8 +5231,8 @@
   : SIMDAcrossLanesIntrinsic {
 // If there is a masking operation keeping only what has been actually
 // generated, consume it.
-def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
-            (opNode (v8i8 V64:$Rn)), (i32 0)), (i64 0))), maski8_or_more)),
+def : Pat<(i32 (and (i32 (vector_extract (v16i8 (insert_subvector undef,
+            (opNode (v8i8 V64:$Rn)), (i32 0))), (i64 0))), maski8_or_more)),
         (i32 (EXTRACT_SUBREG
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
           ssub))>;
@@ -5243,8 +5243,8 @@
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
           ssub))>;
-def : Pat<(i32 (and (i32 (vector_extract (insert_subvector undef,
-            (opNode (v4i16 V64:$Rn)), (i32 0)), (i64 0))), maski16_or_more)),
+def : Pat<(i32 (and (i32 (vector_extract (v8i16 (insert_subvector undef,
+            (opNode (v4i16 V64:$Rn)), (i32 0))), (i64 0))), maski16_or_more)),
         (i32 (EXTRACT_SUBREG
           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
             (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -110,20 +110,25 @@
 def sve_cntd_imm_neg : ComplexPattern">;

 def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
-
-def AArch64smaxv_pred    : SDNode<"AArch64ISD::SMAXV_PRED",   SDT_AArch64Reduce>;
-def AArch64umaxv_pred    : SDNode<"AArch64ISD::UMAXV_PRED",   SDT_AArch64Reduce>;
-def AArch64sminv_pred    : SDNode<"AArch64ISD::SMINV_PRED",   SDT_AArch64Reduce>;
-def AArch64uminv_pred    : SDNode<"AArch64ISD::UMINV_PRED",   SDT_AArch64Reduce>;
-def AArch64orv_pred      : SDNode<"AArch64ISD::ORV_PRED",     SDT_AArch64Reduce>;
-def AArch64eorv_pred     : SDNode<"AArch64ISD::EORV_PRED",    SDT_AArch64Reduce>;
-def AArch64andv_pred     : SDNode<"AArch64ISD::ANDV_PRED",    SDT_AArch64Reduce>;
-def AArch64lasta         : SDNode<"AArch64ISD::LASTA",        SDT_AArch64Reduce>;
-def AArch64lastb         : SDNode<"AArch64ISD::LASTB",        SDT_AArch64Reduce>;
+def AArch64faddv_pred    : SDNode<"AArch64ISD::FADDV_PRED",   SDT_AArch64Reduce>;
+def AArch64fmaxv_pred    : SDNode<"AArch64ISD::FMAXV_PRED",   SDT_AArch64Reduce>;
+def AArch64fmaxnmv_pred  : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
+def AArch64fminv_pred    : SDNode<"AArch64ISD::FMINV_PRED",   SDT_AArch64Reduce>;
+def AArch64fminnmv_pred  : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
+def AArch64smaxv_pred    : SDNode<"AArch64ISD::SMAXV_PRED",   SDT_AArch64Reduce>;
+def AArch64umaxv_pred    : SDNode<"AArch64ISD::UMAXV_PRED",   SDT_AArch64Reduce>;
+def AArch64sminv_pred    : SDNode<"AArch64ISD::SMINV_PRED",   SDT_AArch64Reduce>;
+def AArch64uminv_pred    : SDNode<"AArch64ISD::UMINV_PRED",   SDT_AArch64Reduce>;
+def AArch64orv_pred      : SDNode<"AArch64ISD::ORV_PRED",     SDT_AArch64Reduce>;
+def AArch64eorv_pred     : SDNode<"AArch64ISD::EORV_PRED",    SDT_AArch64Reduce>;
+def AArch64andv_pred     : SDNode<"AArch64ISD::ANDV_PRED",    SDT_AArch64Reduce>;
+def AArch64lasta         : SDNode<"AArch64ISD::LASTA",        SDT_AArch64Reduce>;
+def AArch64lastb         : SDNode<"AArch64ISD::LASTB",        SDT_AArch64Reduce>;

 def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
 def AArch64clasta_n   : SDNode<"AArch64ISD::CLASTA_N",   SDT_AArch64ReduceWithInit>;
 def AArch64clastb_n   : SDNode<"AArch64ISD::CLASTB_N",   SDT_AArch64ReduceWithInit>;
+def AArch64fadda_pred : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;

 def SDT_AArch64Rev   : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def AArch64rev       : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
@@ -320,12 +325,12 @@
   defm FMUL_ZZZI   :
       sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;

   // SVE floating point reductions.
-  defm FADDA_VPZ   : sve_fp_2op_p_vd<0b000, "fadda",   int_aarch64_sve_fadda>;
-  defm FADDV_VPZ   : sve_fp_fast_red<0b000, "faddv",   int_aarch64_sve_faddv>;
-  defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>;
-  defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>;
-  defm FMAXV_VPZ   : sve_fp_fast_red<0b110, "fmaxv",   int_aarch64_sve_fmaxv>;
-  defm FMINV_VPZ   : sve_fp_fast_red<0b111, "fminv",   int_aarch64_sve_fminv>;
+  defm FADDA_VPZ   : sve_fp_2op_p_vd<0b000, "fadda",   AArch64fadda_pred>;
+  defm FADDV_VPZ   : sve_fp_fast_red<0b000, "faddv",   AArch64faddv_pred>;
+  defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_pred>;
+  defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_pred>;
+  defm FMAXV_VPZ   : sve_fp_fast_red<0b110, "fmaxv",   AArch64fmaxv_pred>;
+  defm FMINV_VPZ   : sve_fp_fast_red<0b111, "fminv",   AArch64fminv_pred>;

   // Splat immediate (unpredicated)
   defm DUP_ZI : sve_int_dup_imm<"dup">;
@@ -1399,6 +1404,13 @@
   def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
             (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;

+  def : Pat<(v8f16 (extract_subvector ZPR:$Zs, (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, zsub)>;
+  def : Pat<(v4f32 (extract_subvector ZPR:$Zs, (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, zsub)>;
+  def : Pat<(v2f64 (extract_subvector ZPR:$Zs, (i64 0))),
+            (EXTRACT_SUBREG ZPR:$Zs, zsub)>;
+
   // Add more complex addressing modes here as required
   multiclass pred_load {
Index: llvm/lib/Target/AArch64/SVEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4444,8 +4444,8 @@
 //===----------------------------------------------------------------------===//

 class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
-                      ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+                      ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
   asm, "\t$Vd, $Pg, $Zn",
   "",
   []>, Sched<[]> {
@@ -4463,13 +4463,13 @@
 }

 multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
-  def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
-  def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
-  def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
+  def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>;
+  def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
+  def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;

-  def : SVE_2_Op_Pat(NAME # _H)>;
-  def : SVE_2_Op_Pat(NAME # _S)>;
-  def : SVE_2_Op_Pat(NAME # _D)>;
+  def : SVE_2_Op_Pat(NAME # _H)>;
+  def : SVE_2_Op_Pat(NAME # _S)>;
+  def : SVE_2_Op_Pat(NAME # _D)>;
 }

@@ -4478,8 +4478,8 @@
 //===----------------------------------------------------------------------===//

 class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
-                      ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
+                      ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm),
   asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
   "",
   []>,
@@ -4500,13 +4500,13 @@
 }

 multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
-  def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
-  def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
-  def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
+  def _H :
           sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>;
+  def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
+  def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;

-  def : SVE_3_Op_Pat(NAME # _H)>;
-  def : SVE_3_Op_Pat(NAME # _S)>;
-  def : SVE_3_Op_Pat(NAME # _D)>;
+  def : SVE_3_Op_Pat(NAME # _H)>;
+  def : SVE_3_Op_Pat(NAME # _S)>;
+  def : SVE_3_Op_Pat(NAME # _D)>;
 }

 //===----------------------------------------------------------------------===//
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-fp-reduce.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-intrinsics-fp-reduce.ll
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-fp-reduce.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s

 ;
 ; FADDA
Index: llvm/utils/TableGen/CodeGenDAGPatterns.cpp
===================================================================
--- llvm/utils/TableGen/CodeGenDAGPatterns.cpp
+++ llvm/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -623,13 +623,12 @@
   auto IsSubVec = [](MVT B, MVT P) -> bool {
     if (!B.isVector() || !P.isVector())
       return false;
-    // Logically a <4 x i32> is a valid subvector of <n x 4 x i32>
-    // but until there are obvious use-cases for this, keep the
-    // types separate.
-    if (B.isScalableVector() != P.isScalableVector())
-      return false;
     if (B.getVectorElementType() != P.getVectorElementType())
       return false;
+    // Logically a <k x eltTy> is a valid subvector of <n x m x eltTy> when
+    // k <= m.
+    if (!B.isScalableVector() && P.isScalableVector())
+      return (B.getVectorNumElements() <= P.getVectorNumElements());
     return B.getVectorNumElements() < P.getVectorNumElements();
   };
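---

Illustrative usage (not part of the diff): a minimal LLVM IR sketch of how the retyped reduction intrinsics are exercised, mirroring sve-intrinsics-fp-reduce.ll. The function names below are hypothetical; the intrinsic signatures follow the AdvSIMD_SVE_Reduce / AdvSIMD_SVE_ReduceWithInit classes used above, and the expected assembly is only indicative.

; Hypothetical example functions, assuming -mattr=+sve.
declare half @llvm.aarch64.sve.faddv.nxv8f16(<vscale x 8 x i1>, <vscale x 8 x half>)
declare half @llvm.aarch64.sve.fadda.nxv8f16(<vscale x 8 x i1>, half, <vscale x 8 x half>)

define half @example_faddv(<vscale x 8 x i1> %pg, <vscale x 8 x half> %a) {
  ; With this patch the call should lower to a single fast reduction,
  ; roughly: faddv h0, p0, z0.h
  %out = call half @llvm.aarch64.sve.faddv.nxv8f16(<vscale x 8 x i1> %pg,
                                                   <vscale x 8 x half> %a)
  ret half %out
}

define half @example_fadda(<vscale x 8 x i1> %pg, half %init, <vscale x 8 x half> %a) {
  ; Ordered reduction taking an initial value, roughly: fadda h0, p0, h0, z1.h
  %out = call half @llvm.aarch64.sve.fadda.nxv8f16(<vscale x 8 x i1> %pg,
                                                   half %init,
                                                   <vscale x 8 x half> %a)
  ret half %out
}

The FADDA_PRED/FADDV_PRED-style nodes leave the reduction result in the first element of a vector register, which is why the combines create an EXTRACT_VECTOR_ELT of lane 0 and why LowerEXTRACT_VECTOR_ELT plus the new extract_subvector patterns are needed to read that lane through the NEON subregister.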