Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -568,6 +568,7 @@
   SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         std::vector<SDNode *> *Created) const override;
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -553,7 +553,6 @@
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
-  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
@@ -659,6 +658,19 @@
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);

+    // Vector reductions
+    for (MVT VT : MVT::integer_valuetypes()) {
+      setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+    }
+    for (MVT VT : MVT::fp_valuetypes()) {
+      setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
+      setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+    }
+
     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     // Likewise, narrowing and extending vector loads/stores aren't handled
@@ -2606,6 +2618,14 @@
     return LowerMUL(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::VECREDUCE_ADD:
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_UMIN:
+  case ISD::VECREDUCE_FMAX:
+  case ISD::VECREDUCE_FMIN:
+    return LowerVECREDUCE(Op, DAG);
   }
 }
@@ -7128,6 +7148,47 @@
   return Cmp;
 }

+// Emit the AArch64 across-vector reduction node for the source vector and
+// extract lane 0 as the scalar result.
+static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
+                                  SelectionDAG &DAG) {
+  SDValue VecOp = ScalarOp.getOperand(0);
+  auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
+                     DAG.getConstant(0, DL, MVT::i64));
+}
+
+SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  switch (Op.getOpcode()) {
+  case ISD::VECREDUCE_ADD:
+    return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
+  case ISD::VECREDUCE_SMAX:
+    return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
+  case ISD::VECREDUCE_SMIN:
+    return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
+  case ISD::VECREDUCE_UMAX:
+    return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
+  case ISD::VECREDUCE_UMIN:
+    return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
+  case ISD::VECREDUCE_FMAX: {
+    assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
+    return DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
+        DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
+        Op.getOperand(0));
+  }
+  case ISD::VECREDUCE_FMIN: {
+    assert(Op->getFlags().hasNoNaNs() && "fmin 
vector reduction needs NoNaN flag"); + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), + DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), + Op.getOperand(0)); + } + default: + llvm_unreachable("Unhandled reduction"); + } +} + /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. @@ -9490,266 +9551,6 @@ return SDValue(); } -/// This function handles the log2-shuffle pattern produced by the -/// LoopVectorizer for the across vector reduction. It consists of -/// log2(NumVectorElements) steps and, in each step, 2^(s) elements -/// are reduced, where s is an induction variable from 0 to -/// log2(NumVectorElements). -static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV, - unsigned Op, - SelectionDAG &DAG) { - EVT VTy = OpV->getOperand(0).getValueType(); - if (!VTy.isVector()) - return SDValue(); - - int NumVecElts = VTy.getVectorNumElements(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (NumVecElts != 4) - return SDValue(); - } else { - if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16) - return SDValue(); - } - - int NumExpectedSteps = APInt(8, NumVecElts).logBase2(); - SDValue PreOp = OpV; - // Iterate over each step of the across vector reduction. - for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) { - SDValue CurOp = PreOp.getOperand(0); - SDValue Shuffle = PreOp.getOperand(1); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) { - // Try to swap the 1st and 2nd operand as add and min/max instructions - // are commutative. - CurOp = PreOp.getOperand(1); - Shuffle = PreOp.getOperand(0); - if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) - return SDValue(); - } - - // Check if the input vector is fed by the operator we want to handle, - // except the last step; the very first input vector is not necessarily - // the same operator we are handling. - if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1))) - return SDValue(); - - // Check if it forms one step of the across vector reduction. - // E.g., - // %cur = add %1, %0 - // %shuffle = vector_shuffle %cur, <2, 3, u, u> - // %pre = add %cur, %shuffle - if (Shuffle.getOperand(0) != CurOp) - return SDValue(); - - int NumMaskElts = 1 << CurStep; - ArrayRef Mask = cast(Shuffle)->getMask(); - // Check mask values in each step. - // We expect the shuffle mask in each step follows a specific pattern - // denoted here by the form, where M is a sequence of integers - // starting from NumMaskElts, increasing by 1, and the number integers - // in M should be NumMaskElts. U is a sequence of UNDEFs and the number - // of undef in U should be NumVecElts - NumMaskElts. 
- // E.g., for <8 x i16>, mask values in each step should be : - // step 0 : <1,u,u,u,u,u,u,u> - // step 1 : <2,3,u,u,u,u,u,u> - // step 2 : <4,5,6,7,u,u,u,u> - for (int i = 0; i < NumVecElts; ++i) - if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) || - (i >= NumMaskElts && !(Mask[i] < 0))) - return SDValue(); - - PreOp = CurOp; - } - unsigned Opcode; - bool IsIntrinsic = false; - - switch (Op) { - default: - llvm_unreachable("Unexpected operator for across vector reduction"); - case ISD::ADD: - Opcode = AArch64ISD::UADDV; - break; - case ISD::SMAX: - Opcode = AArch64ISD::SMAXV; - break; - case ISD::UMAX: - Opcode = AArch64ISD::UMAXV; - break; - case ISD::SMIN: - Opcode = AArch64ISD::SMINV; - break; - case ISD::UMIN: - Opcode = AArch64ISD::UMINV; - break; - case ISD::FMAXNUM: - Opcode = Intrinsic::aarch64_neon_fmaxnmv; - IsIntrinsic = true; - break; - case ISD::FMINNUM: - Opcode = Intrinsic::aarch64_neon_fminnmv; - IsIntrinsic = true; - break; - } - SDLoc DL(N); - - return IsIntrinsic - ? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0), - DAG.getConstant(Opcode, DL, MVT::i32), PreOp) - : DAG.getNode( - ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), - DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp), - DAG.getConstant(0, DL, MVT::i64)); -} - -/// Target-specific DAG combine for the across vector min/max reductions. -/// This function specifically handles the final clean-up step of the vector -/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which narrows down and finds the final min/max value from all -/// elements of the vector. -/// For example, for a <16 x i8> vector : -/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> -/// %smax0 = smax %arr, svn0 -/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax1 = smax %smax0, %svn1 -/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %smax2 = smax %smax1, svn2 -/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -/// %sc = setcc %smax2, %svn3, gt -/// %n0 = extract_vector_elt %sc, #0 -/// %n1 = extract_vector_elt %smax2, #0 -/// %n2 = extract_vector_elt $smax2, #1 -/// %result = select %n0, %n1, n2 -/// becomes : -/// %1 = smaxv %0 -/// %result = extract_vector_elt %1, 0 -static SDValue -performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue IfTrue = N->getOperand(1); - SDValue IfFalse = N->getOperand(2); - - // Check if the SELECT merges up the final result of the min/max - // from a vector. - if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT) - return SDValue(); - - // Expect N0 is fed by SETCC. - SDValue SetCC = N0.getOperand(0); - EVT SetCCVT = SetCC.getValueType(); - if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() || - SetCCVT.getVectorElementType() != MVT::i1) - return SDValue(); - - SDValue VectorOp = SetCC.getOperand(0); - unsigned Op = VectorOp->getOpcode(); - // Check if the input vector is fed by the operator we want to handle. 
- if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN && - Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM) - return SDValue(); - - EVT VTy = VectorOp.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) { - if (EltTy != MVT::f32) - return SDValue(); - } else { - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - } - - // Check if extracting from the same vector. - // For example, - // %sc = setcc %vector, %svn1, gt - // %n0 = extract_vector_elt %sc, #0 - // %n1 = extract_vector_elt %vector, #0 - // %n2 = extract_vector_elt $vector, #1 - if (!(VectorOp == IfTrue->getOperand(0) && - VectorOp == IfFalse->getOperand(0))) - return SDValue(); - - // Check if the condition code is matched with the operator type. - ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); - if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) || - (Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) || - (Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) || - (Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) || - (Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE && - CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT && - CC != ISD::SETGE) || - (Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE && - CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT && - CC != ISD::SETLE)) - return SDValue(); - - // Expect to check only lane 0 from the vector SETCC. - if (!isNullConstant(N0.getOperand(1))) - return SDValue(); - - // Expect to extract the true value from lane 0. - if (!isNullConstant(IfTrue.getOperand(1))) - return SDValue(); - - // Expect to extract the false value from lane 1. - if (!isOneConstant(IfFalse.getOperand(1))) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG); -} - -/// Target-specific DAG combine for the across vector add reduction. -/// This function specifically handles the final clean-up step of the vector -/// add reduction produced by the LoopVectorizer. It is the log2-shuffle -/// pattern, which adds all elements of a vector together. -/// For example, for a <4 x i32> vector : -/// %1 = vector_shuffle %0, <2,3,u,u> -/// %2 = add %0, %1 -/// %3 = vector_shuffle %2, <1,u,u,u> -/// %4 = add %2, %3 -/// %result = extract_vector_elt %4, 0 -/// becomes : -/// %0 = uaddv %0 -/// %result = extract_vector_elt %0, 0 -static SDValue -performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG, - const AArch64Subtarget *Subtarget) { - if (!Subtarget->hasNEON()) - return SDValue(); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - - // Check if the input vector is fed by the ADD. - if (N0->getOpcode() != ISD::ADD) - return SDValue(); - - // The vector extract idx must constant zero because we only expect the final - // result of the reduction is placed in lane 0. - if (!isNullConstant(N1)) - return SDValue(); - - EVT VTy = N0.getValueType(); - if (!VTy.isVector()) - return SDValue(); - - EVT EltTy = VTy.getVectorElementType(); - if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8) - return SDValue(); - - if (VTy.getSizeInBits() < 64) - return SDValue(); - - return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG); -} /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
@@ -10428,12 +10229,8 @@
     return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::SELECT: {
-    SDValue RV = performSelectCombine(N, DCI);
-    if (!RV.getNode())
-      RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
-    return RV;
-  }
+  case ISD::SELECT:
+    return performSelectCombine(N, DCI);
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::LOAD:
@@ -10455,8 +10252,6 @@
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
-  case ISD::EXTRACT_VECTOR_ELT:
-    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -10676,6 +10471,14 @@
   case ISD::BITCAST:
     ReplaceBITCASTResults(N, Results, DAG);
     return;
+  case ISD::VECREDUCE_ADD:
+  case ISD::VECREDUCE_SMAX:
+  case ISD::VECREDUCE_SMIN:
+  case ISD::VECREDUCE_UMAX:
+  case ISD::VECREDUCE_UMIN:
+    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
+    return;
+
   case AArch64ISD::SADDV:
     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
     return;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -141,6 +141,9 @@
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     return false;
   }
+
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             TTI::ReductionFlags Flags) const;
   /// @}
 };
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -769,3 +769,26 @@
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
   return ST->getMaxPrefetchIterationsAhead();
 }
+
+bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                                           TTI::ReductionFlags Flags) const {
+  assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+  switch (Opcode) {
+  case Instruction::FAdd:
+  case Instruction::FMul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Mul:
+    return false;
+  case Instruction::Add:
+    return Ty->getScalarSizeInBits() * Ty->getVectorNumElements() >= 128;
+  case Instruction::ICmp:
+    return Ty->getScalarSizeInBits() < 64;
+  case Instruction::FCmp:
+    return Flags.NoNaN;
+  default:
+    llvm_unreachable("Unhandled reduction opcode");
+  }
+  return false;
+}
Index: test/CodeGen/AArch64/aarch64-addv.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-addv.ll
+++ test/CodeGen/AArch64/aarch64-addv.ll
@@ -1,18 +1,16 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=generic | FileCheck %s

+; Function Attrs: nounwind readnone
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)
+declare i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>)
+declare i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>)
+
 define i8 @add_B(<16 x i8>* %arr) {
 ; CHECK-LABEL: add_B
 ; CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
   %bin.rdx = load <16 x i8>, <16 x i8>* %arr
-  %rdx.shuf0 = shufflevector <16 x i8> %bin.rdx, <16 x i8> undef, <16 x i32>
-  %bin.rdx0 = add <16 x i8> %bin.rdx, %rdx.shuf0
-  %rdx.shuf = shufflevector <16 x i8> %bin.rdx0, 
<16 x i8> undef, <16 x i32> - %bin.rdx11 = add <16 x i8> %bin.rdx0, %rdx.shuf - %rdx.shuf12 = shufflevector <16 x i8> %bin.rdx11, <16 x i8> undef, <16 x i32> - %bin.rdx13 = add <16 x i8> %bin.rdx11, %rdx.shuf12 - %rdx.shuf13 = shufflevector <16 x i8> %bin.rdx13, <16 x i8> undef, <16 x i32> - %bin.rdx14 = add <16 x i8> %bin.rdx13, %rdx.shuf13 - %r = extractelement <16 x i8> %bin.rdx14, i32 0 + %r = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8> %bin.rdx) ret i8 %r } @@ -20,13 +18,7 @@ ; CHECK-LABEL: add_H ; CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h %bin.rdx = load <8 x i16>, <8 x i16>* %arr - %rdx.shuf = shufflevector <8 x i16> %bin.rdx, <8 x i16> undef, <8 x i32> - %bin.rdx11 = add <8 x i16> %bin.rdx, %rdx.shuf - %rdx.shuf12 = shufflevector <8 x i16> %bin.rdx11, <8 x i16> undef, <8 x i32> - %bin.rdx13 = add <8 x i16> %bin.rdx11, %rdx.shuf12 - %rdx.shuf13 = shufflevector <8 x i16> %bin.rdx13, <8 x i16> undef, <8 x i32> - %bin.rdx14 = add <8 x i16> %bin.rdx13, %rdx.shuf13 - %r = extractelement <8 x i16> %bin.rdx14, i32 0 + %r = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16> %bin.rdx) ret i16 %r } @@ -34,11 +26,7 @@ ; CHECK-LABEL: add_S ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <4 x i32>, <4 x i32>* %arr - %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> - %bin.rdx11 = add <4 x i32> %bin.rdx, %rdx.shuf - %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> - %bin.rdx13 = add <4 x i32> %bin.rdx11, %rdx.shuf12 - %r = extractelement <4 x i32> %bin.rdx13, i32 0 + %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %bin.rdx) ret i32 %r } @@ -46,12 +34,12 @@ ; CHECK-LABEL: add_D ; CHECK-NOT: addv %bin.rdx = load <2 x i64>, <2 x i64>* %arr - %rdx.shuf0 = shufflevector <2 x i64> %bin.rdx, <2 x i64> undef, <2 x i32> - %bin.rdx0 = add <2 x i64> %bin.rdx, %rdx.shuf0 - %r = extractelement <2 x i64> %bin.rdx0, i32 0 + %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %bin.rdx) ret i64 %r } +declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>) + define i32 @oversized_ADDV_256(i8* noalias nocapture readonly %arg1, i8* noalias nocapture readonly %arg2) { ; CHECK-LABEL: oversized_ADDV_256 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s @@ -66,33 +54,16 @@ %7 = icmp slt <8 x i32> %6, zeroinitializer %8 = sub nsw <8 x i32> zeroinitializer, %6 %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6 - %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> - %bin.rdx = add <8 x i32> %9, %rdx.shuf - %rdx.shuf1 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> - %bin.rdx2 = add <8 x i32> %bin.rdx, %rdx.shuf1 - %rdx.shuf3 = shufflevector <8 x i32> %bin.rdx2, <8 x i32> undef, <8 x i32> - %bin.rdx4 = add <8 x i32> %bin.rdx2, %rdx.shuf3 - %10 = extractelement <8 x i32> %bin.rdx4, i32 0 - ret i32 %10 + %r = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %9) + ret i32 %r } +declare i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32>) + define i32 @oversized_ADDV_512(<16 x i32>* %arr) { ; CHECK-LABEL: oversized_ADDV_512 ; CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s %bin.rdx = load <16 x i32>, <16 x i32>* %arr - - %rdx.shuf0 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> - %bin.rdx0 = add <16 x i32> %bin.rdx, %rdx.shuf0 - - %rdx.shuf = shufflevector <16 x i32> %bin.rdx0, <16 x i32> undef, <16 x i32> - %bin.rdx11 = add <16 x i32> %bin.rdx0, %rdx.shuf - - %rdx.shuf12 = shufflevector <16 x i32> %bin.rdx11, <16 x 
i32> undef, <16 x i32> - %bin.rdx13 = add <16 x i32> %bin.rdx11, %rdx.shuf12 - - %rdx.shuf13 = shufflevector <16 x i32> %bin.rdx13, <16 x i32> undef, <16 x i32> - %bin.rdx14 = add <16 x i32> %bin.rdx13, %rdx.shuf13 - - %r = extractelement <16 x i32> %bin.rdx14, i32 0 + %r = call i32 @llvm.experimental.vector.reduce.add.i32.v16i32(<16 x i32> %bin.rdx) ret i32 %r } Index: test/CodeGen/AArch64/aarch64-minmaxv.ll =================================================================== --- test/CodeGen/AArch64/aarch64-minmaxv.ll +++ test/CodeGen/AArch64/aarch64-minmaxv.ll @@ -2,344 +2,148 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +declare i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8>) +declare i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16>) +declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32>) +declare i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8>) +declare i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16>) +declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32>) + +declare i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8>) +declare i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16>) +declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32>) +declare i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8>) +declare i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16>) +declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32>) + +declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float>) +declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float>) + ; CHECK-LABEL: smax_B ; CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smax_B(<16 x i8>* nocapture readonly %arr) { %arr.load = load <16 x i8>, <16 x i8>* %arr - %rdx.shuf = shufflevector <16 x i8> %arr.load, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp sgt <16 x i8> %arr.load, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %arr.load, <16 x i8> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp sgt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp sgt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp sgt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + %r = call i8 @llvm.experimental.vector.reduce.smax.i8.v16i8(<16 x i8> %arr.load) ret i8 %r } ; CHECK-LABEL: smax_H ; CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smax_H(<8 x i16>* nocapture readonly %arr) { - %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr - %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp23 = icmp sgt <8 x i16> %rdx.minmax.select, %rdx.shuf - 
%rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf - %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp26 = icmp sgt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 - %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 - %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp29 = icmp sgt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 - %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 - %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 - %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 - %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + %arr.load = load <8 x i16>, <8 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v8i16(<8 x i16> %arr.load) ret i16 %r } ; CHECK-LABEL: smax_S ; CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smax_S(<4 x i32> * nocapture readonly %arr) { - %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr - %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp18 = icmp sgt <4 x i32> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf - %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp21 = icmp sgt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 - %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 - %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 - %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 - %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + %arr.load = load <4 x i32>, <4 x i32>* %arr + %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %arr.load) ret i32 %r } -; CHECK-LABEL: smax_D -; CHECK-NOT: smaxv -define i64 @smax_D(<2 x i64>* nocapture readonly %arr) { - %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr - %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> - %rdx.minmax.cmp18 = icmp sgt <2 x i64> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 - %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 - %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 - %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt - ret i64 %r -} - - ; CHECK-LABEL: umax_B ; CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umax_B(<16 x i8>* nocapture readonly %arr) { - %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr - %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp ugt <16 x i8> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ugt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> - 
%rdx.minmax.cmp28 = icmp ugt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ugt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + %arr.load = load <16 x i8>, <16 x i8>* %arr + %r = call i8 @llvm.experimental.vector.reduce.umax.i8.v16i8(<16 x i8> %arr.load) ret i8 %r } ; CHECK-LABEL: umax_H ; CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umax_H(<8 x i16>* nocapture readonly %arr) { - %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr - %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp23 = icmp ugt <8 x i16> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf - %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp26 = icmp ugt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 - %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 - %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp29 = icmp ugt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 - %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 - %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 - %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 - %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + %arr.load = load <8 x i16>, <8 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v8i16(<8 x i16> %arr.load) ret i16 %r } ; CHECK-LABEL: umax_S ; CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umax_S(<4 x i32>* nocapture readonly %arr) { - %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr - %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp18 = icmp ugt <4 x i32> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf - %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp21 = icmp ugt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 - %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 - %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 - %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 - %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + %arr.load = load <4 x i32>, <4 x i32>* %arr + %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %arr.load) ret i32 %r } -; CHECK-LABEL: umax_D -; CHECK-NOT: umaxv -define i64 @umax_D(<2 x i64>* nocapture readonly %arr) { - %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr - %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> - %rdx.minmax.cmp18 = icmp ugt <2 x i64> %rdx.minmax.select, %rdx.shuf - 
%rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 - %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 - %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 - %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt - ret i64 %r -} - - ; CHECK-LABEL: smin_B ; CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @smin_B(<16 x i8>* nocapture readonly %arr) { - %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr - %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp slt <16 x i8> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp slt <16 x i8> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp slt <16 x i8> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp slt <16 x i8> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + %arr.load = load <16 x i8>, <16 x i8>* %arr + %r = call i8 @llvm.experimental.vector.reduce.smin.i8.v16i8(<16 x i8> %arr.load) ret i8 %r } ; CHECK-LABEL: smin_H ; CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @smin_H(<8 x i16>* nocapture readonly %arr) { - %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr - %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp23 = icmp slt <8 x i16> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf - %rdx.shuf25 = shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp26 = icmp slt <8 x i16> %rdx.minmax.select24, %rdx.shuf25 - %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 - %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp29 = icmp slt <8 x i16> %rdx.minmax.select27, %rdx.shuf28 - %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 - %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 - %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 - %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + %arr.load = load <8 x i16>, <8 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v8i16(<8 x i16> %arr.load) ret i16 %r } ; CHECK-LABEL: smin_S ; CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @smin_S(<4 x i32>* nocapture readonly %arr) { - %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr - %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> 
undef, <4 x i32> - %rdx.minmax.cmp18 = icmp slt <4 x i32> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf - %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp21 = icmp slt <4 x i32> %rdx.minmax.select19, %rdx.shuf20 - %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 - %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 - %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 - %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + %arr.load = load <4 x i32>, <4 x i32>* %arr + %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %arr.load) ret i32 %r } -; CHECK-LABEL: smin_D -; CHECK-NOT: sminv -define i64 @smin_D(<2 x i64>* nocapture readonly %arr) { - %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr - %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> - %rdx.minmax.cmp18 = icmp slt <2 x i64> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 - %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 - %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 - %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt - ret i64 %r -} - - ; CHECK-LABEL: umin_B ; CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b define i8 @umin_B(<16 x i8>* nocapture readonly %arr) { - %rdx.minmax.select = load <16 x i8>, <16 x i8>* %arr - %rdx.shuf = shufflevector <16 x i8> %rdx.minmax.select, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp ult <16 x i8> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i8> %rdx.minmax.select, <16 x i8> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i8> %rdx.minmax.select23, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ult <16 x i8> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i8> %rdx.minmax.select23, <16 x i8> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i8> %rdx.minmax.select26, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp ult <16 x i8> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i8> %rdx.minmax.select26, <16 x i8> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i8> %rdx.minmax.select29, <16 x i8> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ult <16 x i8> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i8> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i8 %rdx.minmax.select29.elt, i8 %rdx.shuf30.elt + %arr.load = load <16 x i8>, <16 x i8>* %arr + %r = call i8 @llvm.experimental.vector.reduce.umin.i8.v16i8(<16 x i8> %arr.load) ret i8 %r } ; CHECK-LABEL: umin_H ; CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h define i16 @umin_H(<8 x i16>* nocapture readonly %arr) { - %rdx.minmax.select = load <8 x i16>, <8 x i16>* %arr - %rdx.shuf = shufflevector <8 x i16> %rdx.minmax.select, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp23 = icmp ult <8 x i16> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select24 = select <8 x i1> %rdx.minmax.cmp23, <8 x i16> %rdx.minmax.select, <8 x i16> %rdx.shuf - %rdx.shuf25 = 
shufflevector <8 x i16> %rdx.minmax.select24, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp26 = icmp ult <8 x i16> %rdx.minmax.select24, %rdx.shuf25 - %rdx.minmax.select27 = select <8 x i1> %rdx.minmax.cmp26, <8 x i16> %rdx.minmax.select24, <8 x i16> %rdx.shuf25 - %rdx.shuf28 = shufflevector <8 x i16> %rdx.minmax.select27, <8 x i16> undef, <8 x i32> - %rdx.minmax.cmp29 = icmp ult <8 x i16> %rdx.minmax.select27, %rdx.shuf28 - %rdx.minmax.cmp29.elt = extractelement <8 x i1> %rdx.minmax.cmp29, i32 0 - %rdx.minmax.select27.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 0 - %rdx.shuf28.elt = extractelement <8 x i16> %rdx.minmax.select27, i32 1 - %r = select i1 %rdx.minmax.cmp29.elt, i16 %rdx.minmax.select27.elt, i16 %rdx.shuf28.elt + %arr.load = load <8 x i16>, <8 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v8i16(<8 x i16> %arr.load) ret i16 %r } ; CHECK-LABEL: umin_S ; CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s define i32 @umin_S(<4 x i32>* nocapture readonly %arr) { - %rdx.minmax.select = load <4 x i32>, <4 x i32>* %arr - %rdx.shuf = shufflevector <4 x i32> %rdx.minmax.select, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp18 = icmp ult <4 x i32> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select19 = select <4 x i1> %rdx.minmax.cmp18, <4 x i32> %rdx.minmax.select, <4 x i32> %rdx.shuf - %rdx.shuf20 = shufflevector <4 x i32> %rdx.minmax.select19, <4 x i32> undef, <4 x i32> - %rdx.minmax.cmp21 = icmp ult <4 x i32> %rdx.minmax.select19, %rdx.shuf20 - %rdx.minmax.cmp21.elt = extractelement <4 x i1> %rdx.minmax.cmp21, i32 0 - %rdx.minmax.select19.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 0 - %rdx.shuf20.elt = extractelement <4 x i32> %rdx.minmax.select19, i32 1 - %r = select i1 %rdx.minmax.cmp21.elt, i32 %rdx.minmax.select19.elt, i32 %rdx.shuf20.elt + %arr.load = load <4 x i32>, <4 x i32>* %arr + %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %arr.load) ret i32 %r } -; CHECK-LABEL: umin_D -; CHECK-NOT: uminv -define i64 @umin_D(<2 x i64>* nocapture readonly %arr) { - %rdx.minmax.select = load <2 x i64>, <2 x i64>* %arr - %rdx.shuf = shufflevector <2 x i64> %rdx.minmax.select, <2 x i64> undef, <2 x i32> - %rdx.minmax.cmp18 = icmp ult <2 x i64> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.cmp18.elt = extractelement <2 x i1> %rdx.minmax.cmp18, i32 0 - %rdx.minmax.select.elt = extractelement <2 x i64> %rdx.minmax.select, i32 0 - %rdx.shuf.elt = extractelement <2 x i64> %rdx.minmax.select, i32 1 - %r = select i1 %rdx.minmax.cmp18.elt, i64 %rdx.minmax.select.elt, i64 %rdx.shuf.elt - ret i64 %r -} - ; CHECK-LABEL: fmaxnm_S ; CHECK: fmaxnmv define float @fmaxnm_S(<4 x float>* nocapture readonly %arr) { - %rdx.minmax.select = load <4 x float>, <4 x float>* %arr - %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> - %rdx.minmax.cmp = fcmp fast oge <4 x float> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf - %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> - %rdx.minmax.cmp1 = fcmp fast oge <4 x float> %rdx.minmax.select1, %rdx.shuf1 - %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 - %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 - %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 - %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + %arr.load = load <4 x 
float>, <4 x float>* %arr + %r = call nnan float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %arr.load) ret float %r } ; CHECK-LABEL: fminnm_S ; CHECK: fminnmv define float @fminnm_S(<4 x float>* nocapture readonly %arr) { - %rdx.minmax.select = load <4 x float>, <4 x float>* %arr - %rdx.shuf = shufflevector <4 x float> %rdx.minmax.select, <4 x float> undef, <4 x i32> - %rdx.minmax.cmp = fcmp fast ole <4 x float> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select1 = select <4 x i1> %rdx.minmax.cmp, <4 x float> %rdx.minmax.select, <4 x float> %rdx.shuf - %rdx.shuf1 = shufflevector <4 x float> %rdx.minmax.select1, <4 x float> undef, <4 x i32> - %rdx.minmax.cmp1 = fcmp fast ole <4 x float> %rdx.minmax.select1, %rdx.shuf1 - %rdx.minmax.cmp1.elt = extractelement <4 x i1> %rdx.minmax.cmp1, i32 0 - %rdx.minmax.select1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 0 - %rdx.shuf1.elt = extractelement <4 x float> %rdx.minmax.select1, i32 1 - %r = select i1 %rdx.minmax.cmp1.elt, float %rdx.minmax.select1.elt, float %rdx.shuf1.elt + %arr.load = load <4 x float>, <4 x float>* %arr + %r = call nnan float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %arr.load) ret float %r } +declare i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16>) + define i16 @oversized_umax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_256 ; CHECK: umax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: umaxv {{h[0-9]+}}, [[V0]] - %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr - %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp ugt <16 x i16> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ugt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp ugt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ugt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + %arr.load = load <16 x i16>, <16 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.umax.i16.v16i16(<16 x i16> %arr.load) ret i16 %r } +declare i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32>) + define i32 @oversized_umax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umax_512 ; CHECK: umax v @@ -347,47 +151,23 @@ ; CHECK-NEXT: umax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: umaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp ugt <16 x i32> %arr.load, %rdx.shuf 
- %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ugt <16 x i32> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp ugt <16 x i32> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ugt <16 x i32> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + %r = call i32 @llvm.experimental.vector.reduce.umax.i32.v16i32(<16 x i32> %arr.load) ret i32 %r } +declare i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16>) + define i16 @oversized_umin_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_256 ; CHECK: umin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: uminv {{h[0-9]+}}, [[V0]] - %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr - %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp ult <16 x i16> %rdx.minmax.select, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ult <16 x i16> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp ult <16 x i16> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ult <16 x i16> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + %arr.load = load <16 x i16>, <16 x i16>* %arr + %r = call i16 @llvm.experimental.vector.reduce.umin.i16.v16i16(<16 x i16> %arr.load) ret i16 %r } +declare i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32>) + define i32 @oversized_umin_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_umin_512 ; CHECK: umin v @@ -395,47 +175,23 @@ ; CHECK-NEXT: umin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: uminv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp22 
= icmp ult <16 x i32> %arr.load, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp ult <16 x i32> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp ult <16 x i32> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp ult <16 x i32> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt + %r = call i32 @llvm.experimental.vector.reduce.umin.i32.v16i32(<16 x i32> %arr.load) ret i32 %r } +declare i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16>) + define i16 @oversized_smax_256(<16 x i16>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_256 ; CHECK: smax [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h ; CHECK: smaxv {{h[0-9]+}}, [[V0]] %arr.load = load <16 x i16>, <16 x i16>* %arr - %rdx.shuf = shufflevector <16 x i16> %arr.load, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp sgt <16 x i16> %arr.load, %rdx.shuf - %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %arr.load, <16 x i16> %rdx.shuf - %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp25 = icmp sgt <16 x i16> %rdx.minmax.select23, %rdx.shuf24 - %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24 - %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp28 = icmp sgt <16 x i16> %rdx.minmax.select26, %rdx.shuf27 - %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27 - %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32> - %rdx.minmax.cmp31 = icmp sgt <16 x i16> %rdx.minmax.select29, %rdx.shuf30 - %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0 - %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0 - %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1 - %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt + %r = call i16 @llvm.experimental.vector.reduce.smax.i16.v16i16(<16 x i16> %arr.load) ret i16 %r } +declare i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32>) + define i32 @oversized_smax_512(<16 x i32>* nocapture readonly %arr) { ; CHECK-LABEL: oversized_smax_512 ; CHECK: smax v @@ -443,47 +199,23 @@ ; CHECK-NEXT: smax [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s ; CHECK-NEXT: smaxv {{s[0-9]+}}, [[V0]] %arr.load = load <16 x i32>, <16 x i32>* %arr - %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32> - %rdx.minmax.cmp22 = icmp sgt <16 x i32> %arr.load, %rdx.shuf 
-  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
-  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp25 = icmp sgt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
-  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
-  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp28 = icmp sgt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
-  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
-  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp31 = icmp sgt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
-  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
-  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
-  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
-  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  %r = call i32 @llvm.experimental.vector.reduce.smax.i32.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }
 
+declare i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16>)
+
 define i16 @oversized_smin_256(<16 x i16>* nocapture readonly %arr) {
 ; CHECK-LABEL: oversized_smin_256
 ; CHECK: smin [[V0:v[0-9]+]].8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 ; CHECK: sminv {{h[0-9]+}}, [[V0]]
-  %rdx.minmax.select = load <16 x i16>, <16 x i16>* %arr
-  %rdx.shuf = shufflevector <16 x i16> %rdx.minmax.select, <16 x i16> undef, <16 x i32>
-  %rdx.minmax.cmp22 = icmp slt <16 x i16> %rdx.minmax.select, %rdx.shuf
-  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i16> %rdx.minmax.select, <16 x i16> %rdx.shuf
-  %rdx.shuf24 = shufflevector <16 x i16> %rdx.minmax.select23, <16 x i16> undef, <16 x i32>
-  %rdx.minmax.cmp25 = icmp slt <16 x i16> %rdx.minmax.select23, %rdx.shuf24
-  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i16> %rdx.minmax.select23, <16 x i16> %rdx.shuf24
-  %rdx.shuf27 = shufflevector <16 x i16> %rdx.minmax.select26, <16 x i16> undef, <16 x i32>
-  %rdx.minmax.cmp28 = icmp slt <16 x i16> %rdx.minmax.select26, %rdx.shuf27
-  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i16> %rdx.minmax.select26, <16 x i16> %rdx.shuf27
-  %rdx.shuf30 = shufflevector <16 x i16> %rdx.minmax.select29, <16 x i16> undef, <16 x i32>
-  %rdx.minmax.cmp31 = icmp slt <16 x i16> %rdx.minmax.select29, %rdx.shuf30
-  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
-  %rdx.minmax.select29.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 0
-  %rdx.shuf30.elt = extractelement <16 x i16> %rdx.minmax.select29, i32 1
-  %r = select i1 %rdx.minmax.cmp31.elt, i16 %rdx.minmax.select29.elt, i16 %rdx.shuf30.elt
+  %arr.load = load <16 x i16>, <16 x i16>* %arr
+  %r = call i16 @llvm.experimental.vector.reduce.smin.i16.v16i16(<16 x i16> %arr.load)
   ret i16 %r
 }
 
+declare i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32>)
+
 define i32 @oversized_smin_512(<16 x i32>* nocapture readonly %arr) {
 ; CHECK-LABEL: oversized_smin_512
 ; CHECK: smin v
@@ -491,20 +223,6 @@
 ; CHECK-NEXT: smin [[V0:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 ; CHECK-NEXT: sminv {{s[0-9]+}}, [[V0]]
   %arr.load = load <16 x i32>, <16 x i32>* %arr
-  %rdx.shuf = shufflevector <16 x i32> %arr.load, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp22 = icmp slt <16 x i32> %arr.load, %rdx.shuf
-  %rdx.minmax.select23 = select <16 x i1> %rdx.minmax.cmp22, <16 x i32> %arr.load, <16 x i32> %rdx.shuf
-  %rdx.shuf24 = shufflevector <16 x i32> %rdx.minmax.select23, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp25 = icmp slt <16 x i32> %rdx.minmax.select23, %rdx.shuf24
-  %rdx.minmax.select26 = select <16 x i1> %rdx.minmax.cmp25, <16 x i32> %rdx.minmax.select23, <16 x i32> %rdx.shuf24
-  %rdx.shuf27 = shufflevector <16 x i32> %rdx.minmax.select26, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp28 = icmp slt <16 x i32> %rdx.minmax.select26, %rdx.shuf27
-  %rdx.minmax.select29 = select <16 x i1> %rdx.minmax.cmp28, <16 x i32> %rdx.minmax.select26, <16 x i32> %rdx.shuf27
-  %rdx.shuf30 = shufflevector <16 x i32> %rdx.minmax.select29, <16 x i32> undef, <16 x i32>
-  %rdx.minmax.cmp31 = icmp slt <16 x i32> %rdx.minmax.select29, %rdx.shuf30
-  %rdx.minmax.cmp31.elt = extractelement <16 x i1> %rdx.minmax.cmp31, i32 0
-  %rdx.minmax.select29.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 0
-  %rdx.shuf30.elt = extractelement <16 x i32> %rdx.minmax.select29, i32 1
-  %r = select i1 %rdx.minmax.cmp31.elt, i32 %rdx.minmax.select29.elt, i32 %rdx.shuf30.elt
+  %r = call i32 @llvm.experimental.vector.reduce.smin.i32.v16i32(<16 x i32> %arr.load)
   ret i32 %r
 }
Index: test/CodeGen/AArch64/arm64-vabs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vabs.ll
+++ test/CodeGen/AArch64/arm64-vabs.ll
@@ -134,8 +134,10 @@
   ret <2 x i64> %tmp4
 }
 
-define i16 @uabdl8h_log2_shuffle(<16 x i8>* %a, <16 x i8>* %b) {
-; CHECK-LABEL: uabdl8h_log2_shuffle
+declare i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16>)
+
+define i16 @uabdl8h_rdx(<16 x i8>* %a, <16 x i8>* %b) {
+; CHECK-LABEL: uabdl8h_rdx
 ; CHECK: uabdl2.8h
 ; CHECK: uabdl.8h
   %aload = load <16 x i8>, <16 x i8>* %a, align 1
@@ -146,20 +148,14 @@
   %abcmp = icmp slt <16 x i16> %abdiff, zeroinitializer
   %ababs = sub nsw <16 x i16> zeroinitializer, %abdiff
   %absel = select <16 x i1> %abcmp, <16 x i16> %ababs, <16 x i16> %abdiff
-  %rdx.shuf = shufflevector <16 x i16> %absel, <16 x i16> undef, <16 x i32>
-  %bin1.rdx = add <16 x i16> %absel, %rdx.shuf
-  %rdx.shufx = shufflevector <16 x i16> %bin1.rdx, <16 x i16> undef, <16 x i32>
-  %bin.rdx = add <16 x i16> %bin1.rdx, %rdx.shufx
-  %rdx.shuf136 = shufflevector <16 x i16> %bin.rdx, <16 x i16> undef, <16 x i32>
-  %bin.rdx137 = add <16 x i16> %bin.rdx, %rdx.shuf136
-  %rdx.shuf138 = shufflevector <16 x i16> %bin.rdx137, <16 x i16> undef, <16 x i32>
-  %bin.rdx139 = add <16 x i16> %bin.rdx137, %rdx.shuf138
-  %reduced_v = extractelement <16 x i16> %bin.rdx139, i16 0
+  %reduced_v = call i16 @llvm.experimental.vector.reduce.add.i16.v16i16(<16 x i16> %absel)
   ret i16 %reduced_v
 }
 
-define i32 @uabdl4s_log2_shuffle(<8 x i16>* %a, <8 x i16>* %b) {
-; CHECK-LABEL: uabdl4s_log2_shuffle
+declare i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32>)
+
+define i32 @uabdl4s_rdx(<8 x i16>* %a, <8 x i16>* %b) {
+; CHECK-LABEL: uabdl4s_rdx
 ; CHECK: uabdl2.4s
 ; CHECK: uabdl.4s
   %aload = load <8 x i16>, <8 x i16>* %a, align 1
@@ -170,18 +166,14 @@
   %abcmp = icmp slt <8 x i32> %abdiff, zeroinitializer
   %ababs = sub nsw <8 x i32> zeroinitializer, %abdiff
   %absel = select <8 x i1> %abcmp, <8 x i32> %ababs, <8 x i32> %abdiff
-  %rdx.shuf = shufflevector <8 x i32> %absel, <8 x i32> undef, <8 x i32>
-  %bin.rdx = add <8 x i32> %absel, %rdx.shuf
-  %rdx.shuf136 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32>
-  %bin.rdx137 = add <8 x i32> %bin.rdx, %rdx.shuf136
-  %rdx.shuf138 = shufflevector <8 x i32> %bin.rdx137, <8 x i32> undef, <8 x i32>
-  %bin.rdx139 = add <8 x i32> %bin.rdx137, %rdx.shuf138
-  %reduced_v = extractelement <8 x i32> %bin.rdx139, i32 0
+  %reduced_v = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %absel)
   ret i32 %reduced_v
 }
 
-define i64 @uabdl2d_log2_shuffle(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
-; CHECK: uabdl2d_log2_shuffle
+declare i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64>)
+
+define i64 @uabdl2d_rdx(<4 x i32>* %a, <4 x i32>* %b, i32 %h) {
+; CHECK: uabdl2d_rdx
 ; CHECK: uabdl2.2d
 ; CHECK: uabdl.2d
   %aload = load <4 x i32>, <4 x i32>* %a, align 1
@@ -192,11 +184,7 @@
   %abcmp = icmp slt <4 x i64> %abdiff, zeroinitializer
   %ababs = sub nsw <4 x i64> zeroinitializer, %abdiff
   %absel = select <4 x i1> %abcmp, <4 x i64> %ababs, <4 x i64> %abdiff
-  %rdx.shuf136 = shufflevector <4 x i64> %absel, <4 x i64> undef, <4 x i32>
-  %bin.rdx137 = add <4 x i64> %absel, %rdx.shuf136
-  %rdx.shuf138 = shufflevector <4 x i64> %bin.rdx137, <4 x i64> undef, <4 x i32>
-  %bin.rdx139 = add <4 x i64> %bin.rdx137, %rdx.shuf138
-  %reduced_v = extractelement <4 x i64> %bin.rdx139, i16 0
+  %reduced_v = call i64 @llvm.experimental.vector.reduce.add.i64.v4i64(<4 x i64> %absel)
   ret i64 %reduced_v
 }
 
Index: test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
+++ test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll
@@ -20,15 +20,7 @@
 ; CHECK: add <16 x i8>
 ;
 ; CHECK: middle.block:
-; CHECK: shufflevector <16 x i8>
-; CHECK: add <16 x i8>
-; CHECK: shufflevector <16 x i8>
-; CHECK: add <16 x i8>
-; CHECK: shufflevector <16 x i8>
-; CHECK: add <16 x i8>
-; CHECK: shufflevector <16 x i8>
-; CHECK: add <16 x i8>
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <16 x i8>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.experimental.vector.reduce.add.i8.v16i8(<16 x i8>
 ; CHECK: zext i8 [[Rdx]] to i32
 ;
 define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
@@ -83,13 +75,7 @@
 ; CHECK: add <8 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_1(i16* nocapture readonly %a, i16* nocapture readonly %b, i32 %n) {
@@ -146,13 +132,7 @@
 ; CHECK: add <8 x i16>
 ;
 ; CHECK: middle.block:
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: shufflevector <8 x i16>
-; CHECK: add <8 x i16>
-; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = extractelement <8 x i16>
+; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.experimental.vector.reduce.add.i16.v8i16(<8 x i16>
 ; CHECK: zext i16 [[Rdx]] to i32
 ;
 define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -11,14 +11,8 @@
 ; DEFAULT-LABEL: @PR28330(
 ; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
 ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> , <8 x i32>
-; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32>
-; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
-; DEFAULT: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32>
-; DEFAULT: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
-; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32>
-; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
-; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17
+; DEFAULT: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[S0]])
+; DEFAULT: %bin.extra = add i32 %[[Rdx]], %tmp17
 ;
 ; GATHER-LABEL: @PR28330(
 ; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
@@ -38,14 +32,8 @@
 ; GATHER: %[[I5:.+]] = insertelement <8 x i32> %[[I4]], i32 %tmp29, i32 5
 ; GATHER: %[[I6:.+]] = insertelement <8 x i32> %[[I5]], i32 %tmp31, i32 6
 ; GATHER: %[[I7:.+]] = insertelement <8 x i32> %[[I6]], i32 %tmp33, i32 7
-; GATHER: %[[R0:.+]] = shufflevector <8 x i32> %[[I7]], <8 x i32> undef, <8 x i32>
-; GATHER: %[[R1:.+]] = add <8 x i32> %[[I7]], %[[R0]]
-; GATHER: %[[R2:.+]] = shufflevector <8 x i32> %[[R1]], <8 x i32> undef, <8 x i32>
-; GATHER: %[[R3:.+]] = add <8 x i32> %[[R1]], %[[R2]]
-; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32>
-; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
-; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; GATHER: %bin.extra = add i32 %[[R6]], %tmp17
+; GATHER: %[[Rdx:.+]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> %[[I7]])
+; GATHER: %bin.extra = add i32 %[[Rdx]], %tmp17
 ;
 ; MAX-COST-LABEL: @PR28330(
 ; MAX-COST-NOT: shufflevector
@@ -107,14 +95,8 @@
 ; DEFAULT-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], undef
 ; DEFAULT-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], undef
 ; DEFAULT-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], undef
-; DEFAULT-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32>
-; DEFAULT-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
-; DEFAULT-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32>
-; DEFAULT-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; DEFAULT-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32>
-; DEFAULT-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; DEFAULT-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP2]])
+; DEFAULT-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5
 ; DEFAULT-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], undef
 ; DEFAULT-NEXT: br label [[FOR_BODY]]
 ;
@@ -162,14 +144,8 @@
 ; GATHER-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5
 ; GATHER-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6
 ; GATHER-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7
-; GATHER-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> undef, <8 x i32>
-; GATHER-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[RDX_SHUF]]
-; GATHER-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32>
-; GATHER-NEXT: [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
-; GATHER-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32>
-; GATHER-NEXT: [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GATHER-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[TMP8]], -5
+; GATHER-NEXT: [[Rdx:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP7]])
+; GATHER-NEXT: [[BIN_EXTRA]] = add i32 [[Rdx]], -5
 ; GATHER-NEXT: [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
 ; GATHER-NEXT: br label [[FOR_BODY]]
 ;