Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1229,6 +1229,17 @@
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index = -1) const;
 
+  /// \return The scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars. If \p
+  /// NeedToShuffle is true, a cost for reshuffling some of the vector
+  /// elements is also added. \p VL holds all the values to be inserted,
+  /// which will make up the vector of type \p Ty. Elements not in \p
+  /// DemandedElts are either constants or duplicated elements.
+  InstructionCost getGatherCost(FixedVectorType *Ty,
+                                const APInt &DemandedElts,
+                                bool NeedToShuffle,
+                                ArrayRef<Value *> VL) const;
+
   /// \return The cost of replication shuffle of \p VF elements typed \p EltTy
   /// \p ReplicationFactor times.
   ///
@@ -1829,6 +1840,10 @@
   virtual InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
                                              TTI::TargetCostKind CostKind,
                                              unsigned Index) = 0;
+  virtual InstructionCost getGatherCost(FixedVectorType *Ty,
+                                        const APInt &DemandedElts,
+                                        bool NeedToShuffle,
+                                        ArrayRef<Value *> VL) = 0;
   virtual InstructionCost
   getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF,
                             const APInt &DemandedDstElts,
@@ -2413,6 +2428,12 @@
                                      unsigned Index) override {
     return Impl.getVectorInstrCost(I, Val, CostKind, Index);
   }
+  InstructionCost getGatherCost(FixedVectorType *Ty,
+                                const APInt &DemandedElts,
+                                bool NeedToShuffle,
+                                ArrayRef<Value *> VL) override {
+    return Impl.getGatherCost(Ty, DemandedElts, NeedToShuffle, VL);
+  }
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -601,6 +601,13 @@
     return 1;
   }
 
+  InstructionCost getGatherCost(FixedVectorType *Ty,
+                                const APInt &DemandedElts,
+                                bool NeedToShuffle,
+                                ArrayRef<Value *> VL) const {
+    return NeedToShuffle;
+  }
+
   unsigned getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                      int VF, const APInt &DemandedDstElts,
                                      TTI::TargetCostKind CostKind) {
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1235,6 +1235,31 @@
                                        Op1);
   }
 
+  InstructionCost getGatherCost(FixedVectorType *Ty,
+                                const APInt &DemandedElts,
+                                bool NeedToShuffle,
+                                ArrayRef<Value *> VL) {
+    assert(VL.size() == Ty->getNumElements() && "Ty does not match the values.");
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost Cost =
+        thisT()->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+                                          /*Extract*/ false, CostKind);
+
+    // Subtract away the costs added for insertions of values loaded from
+    // memory.
+    if (thisT()->supportsEfficientVectorElementLoadStore())
+      for (unsigned Idx = 0; Idx < VL.size(); ++Idx)
+        if (DemandedElts[Idx] && isa<LoadInst>(VL[Idx]))
+          Cost -= thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
+                                              CostKind, Idx, nullptr, nullptr);
+
+    if (NeedToShuffle)
+      Cost += thisT()->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                      Ty, std::nullopt, CostKind, 0, nullptr);
+
+    return Cost;
+  }
+
   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
                                             int VF,
                                             const APInt &DemandedDstElts,
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -929,6 +929,17 @@
   return Cost;
 }
 
+InstructionCost
+TargetTransformInfo::getGatherCost(FixedVectorType *Ty,
+                                   const APInt &DemandedElts,
+                                   bool NeedToShuffle,
+                                   ArrayRef<Value *> VL) const {
+  InstructionCost Cost = TTIImpl->getGatherCost(Ty, DemandedElts,
+                                                NeedToShuffle, VL);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getReplicationShuffleCost(
     Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts,
     TTI::TargetCostKind CostKind) {
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -110,6 +110,8 @@
   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                      TTI::TargetCostKind CostKind,
                                      unsigned Index, Value *Op0, Value *Op1);
+  InstructionCost getGatherCost(FixedVectorType *Ty, const APInt &DemandedElts,
+                                bool NeedToShuffle, ArrayRef<Value *> VL);
   bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                   MaybeAlign Alignment,
@@ -122,6 +124,13 @@
                                              Align Alignment, unsigned AddressSpace,
                                              TTI::TargetCostKind CostKind,
                                              bool UseMaskForCond = false, bool UseMaskForGaps = false);
+  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+                                             std::optional<FastMathFlags> FMF,
+                                             TTI::TargetCostKind CostKind);
+  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                         bool IsUnsigned,
+                                         TTI::TargetCostKind CostKind);
+
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
   /// @}
Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1023,6 +1023,36 @@
   return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
 }
 
+InstructionCost SystemZTTIImpl::getGatherCost(FixedVectorType *Ty,
+                                              const APInt &DemandedElts,
+                                              bool NeedToShuffle,
+                                              ArrayRef<Value *> VL) {
+  if (Ty->isIntOrIntVectorTy(64)) {
+    // VLVGP will insert two GPRs with one instruction, while VLE will load
+    // an element directly with no extra cost. Take special care for cases
+    // where one element is loaded with VLE and the other one still needs an
+    // insertion.
+    assert(VL.size() == Ty->getNumElements() && "Ty does not match the values.");
+    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+    InstructionCost Cost = 0;
+    InstructionCost CurrVectorCost = 0;
+    for (unsigned Idx = 0; Idx < VL.size(); ++Idx) {
+      if (DemandedElts[Idx] && !isa<LoadInst>(VL[Idx]))
+        ++CurrVectorCost;
+      if (Idx % 2 == 1) {
+        Cost += std::min(InstructionCost(1), CurrVectorCost);
+        CurrVectorCost = 0;
+      }
+    }
+    if (NeedToShuffle)
+      Cost += getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty,
+                             std::nullopt, CostKind, 0, nullptr);
+    return Cost;
+  }
+
+  return BaseT::getGatherCost(Ty, DemandedElts, NeedToShuffle, VL);
+}
+
 // Check if a load may be folded as a memory operand in its user.
 bool SystemZTTIImpl::
 isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
@@ -1239,6 +1269,43 @@
   return NumVectorMemOps + NumPermutes;
 }
 
+InstructionCost
+SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+                                           std::optional<FastMathFlags> FMF,
+                                           TTI::TargetCostKind CostKind) {
+  if (!TTI::requiresOrderedReduction(FMF) && ST->hasVector())
+    switch (Opcode) {
+    default: break;
+    case Instruction::FAdd:
+    case Instruction::FMul:
+      unsigned NumVectors = getNumVectorRegs(ValTy);
+      unsigned ScalarBits = ValTy->getScalarSizeInBits();
+      if (ScalarBits == 64 || (ScalarBits == 32 && ST->hasVectorEnhancements1()))
+        return NumVectors;
+      if (ScalarBits == 128 && ST->hasVectorEnhancements1())
+        return NumVectors / 2; // Bonus for the reassociation.
+      break;
+    }
+
+  return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
+}
+
+InstructionCost
+SystemZTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+                                       bool IsUnsigned,
+                                       TTI::TargetCostKind CostKind) {
+  if (Ty->isFPOrFPVectorTy() && ST->hasVectorEnhancements1()) {
+    unsigned NumVectors = getNumVectorRegs(Ty);
+    unsigned ScalarBits = Ty->getScalarSizeInBits();
+    if (ScalarBits == 32 || ScalarBits == 64)
+      return NumVectors;
+    if (ScalarBits == 128)
+      return NumVectors / 2; // Bonus for the reassociation.
+  }
+
+  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+}
+
 static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
   if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
     return getNumVectorRegs(RetTy); // VPERM
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2440,14 +2440,6 @@
   /// for ease of later optimization.
   Value *createBuildVector(const TreeEntry *E);
 
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars. If \p
-  /// NeedToShuffle is true, need to add a cost of reshuffling some of the
-  /// vector elements.
-  InstructionCost getGatherCost(FixedVectorType *Ty,
-                                const APInt &ShuffledIndices,
-                                bool NeedToShuffle) const;
-
   /// Returns the instruction in the bundle, which can be used as a base point
   /// for scheduling. Usually it is the last instruction in the bundle, except
   /// for the case when all operands are external (in this case, it is the first
@@ -2467,8 +2459,8 @@
                         SmallVectorImpl<const TreeEntry *> &Entries);
 
   /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
+  /// this subtree gets vectorized, we may need to insert the values from the
+  /// roots. This method calculates the cost of inserting the values.
   InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
 
   /// Set the Builder insert point to one after the last instruction in
@@ -8579,18 +8571,6 @@
   return std::nullopt;
 }
 
-InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty,
-                                       const APInt &ShuffledIndices,
-                                       bool NeedToShuffle) const {
-  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  InstructionCost Cost =
-      TTI->getScalarizationOverhead(Ty, ~ShuffledIndices, /*Insert*/ true,
-                                    /*Extract*/ false, CostKind);
-  if (NeedToShuffle)
-    Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
-  return Cost;
-}
-
 InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
   // Find the type of the operands in VL.
   Type *ScalarTy = VL[0]->getType();
@@ -8598,9 +8578,9 @@
     ScalarTy = SI->getValueOperand()->getType();
   auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
   bool DuplicateNonConst = false;
-  // Find the cost of inserting/extracting values from the vector.
-  // Check if the same elements are inserted several times and count them as
-  // shuffle candidates.
+  // Find the cost of inserting values into the vector. Check if the same
+  // elements are inserted several times and count them as shuffle
+  // candidates.
   APInt ShuffledElements = APInt::getZero(VL.size());
   DenseSet<Value *> UniqueElements;
   // Iterate in reverse order to consider insert elements with the high cost.
@@ -8616,7 +8596,7 @@
       ShuffledElements.setBit(Idx);
     }
   }
-  return getGatherCost(VecTy, ShuffledElements, DuplicateNonConst);
+  return TTI->getGatherCost(VecTy, ~ShuffledElements, DuplicateNonConst, VL);
 }
 
 // Perform operand reordering on the instructions in VL and return the reordered
Index: llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/SystemZ/vector-reductions-fp.ll
@@ -0,0 +1,310 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN:   -mcpu=z13 < %s 2>&1 | FileCheck %s --check-prefix=Z13
+; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \
+; RUN:   -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15
+
+define void @strict_fadd_reductions() {
+; Z13-LABEL: 'strict_fadd_reductions'
+; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; Z13-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef)
+; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; Z15-LABEL: 'strict_fadd_reductions'
+; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float
@llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fadd_reductions() { +; Z13-LABEL: 'fast_fadd_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'fast_fadd_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v4f32_reassoc = call reassoc float 
@llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + + %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + + %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + + %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + + ret void +} + +define void @strict_fmul_reductions() { +; Z13-LABEL: 'strict_fmul_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret void +; +; Z15-LABEL: 'strict_fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fmul_reductions() { +; Z13-LABEL: 'fast_fmul_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fmul_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %fmul_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'fast_fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v4f32 = call fast float 
@llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fmul_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fmul_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + + %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + + %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + + %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fmul_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + %fadd_v4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + + ret void +} + +define void @strict_fmin_reductions() { +; Z13-LABEL: 'strict_fmin_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128 = call fp128 
@llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'strict_fmin_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) + ret void +} + +define void @fast_fmin_reductions() { +; Z13-LABEL: 'fast_fmin_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128 = call fast fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'fast_fmin_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8f32 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f128 = call fast fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + + %V8f32 = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + + %V2f64 = call fast double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + + %V4f64 = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + + %V4f128 = call fast fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) + %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) + + ret void +} + +define void @strict_fmax_reductions() { +; Z13-LABEL: 'strict_fmax_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'strict_fmax_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) + ret void +} + +define void @fast_fmax_reductions() { +; Z13-LABEL: 'fast_fmax_reductions' +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128 = call fast fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z13-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; Z15-LABEL: 'fast_fmax_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8f32 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V4f128 = call fast fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V4f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + + %V8f32 = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V8f32_reassoc = call reassoc float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + + %V2f64 = call fast double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V2f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + + %V4f64 = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V4f64_reassoc = call reassoc double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + + %V4f128 = call fast fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) + %V4f128_reassoc = call reassoc fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) + + ret void +} + + +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) +declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) +declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>) Index: llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll @@ -0,0 +1,981 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z13 -O3 -o - | FileCheck %s --check-prefix=Z13 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -o - | FileCheck %s --check-prefix=Z15 +; +; Test vectorization and reassociation of fadd operations. Fsubs are +; converted to fadds by InstCombiner so they do not need any separate +; handling. 
+ +define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_4_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_4_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + ret double %add5 +} + +define double @fadd_double_6_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_6_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: adb %f0, 40(%r2) +; Z13-NEXT: adb %f0, 48(%r2) +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_6_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: adb %f0, 40(%r2) +; Z15-NEXT: adb %f0, 48(%r2) +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + ret double %add9 +} + +define double @fadd_double_8_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_8_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 0(%r2), 3 +; Z13-NEXT: vl %v1, 16(%r2), 3 +; Z13-NEXT: vl %v2, 40(%r2), 3 +; Z13-NEXT: vl %v3, 56(%r2), 3 +; Z13-NEXT: vfadb %v1, %v1, %v3 +; Z13-NEXT: vfadb %v0, %v0, %v2 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_8_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2), 3 
+; Z15-NEXT: vl %v1, 16(%r2), 3 +; Z15-NEXT: vl %v2, 40(%r2), 3 +; Z15-NEXT: vl %v3, 56(%r2), 3 +; Z15-NEXT: vfadb %v1, %v1, %v3 +; Z15-NEXT: vfadb %v0, %v0, %v2 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 7 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 8 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define double @fadd_double_16_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_16_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vl %v2, 88(%r2), 3 +; Z13-NEXT: vl %v3, 56(%r2), 3 +; Z13-NEXT: vl %v4, 72(%r2), 3 +; Z13-NEXT: vl %v5, 40(%r2), 3 +; Z13-NEXT: vl %v6, 120(%r2), 3 +; Z13-NEXT: vl %v7, 104(%r2), 3 +; Z13-NEXT: vfadb %v5, %v5, %v7 +; Z13-NEXT: vfadb %v1, %v1, %v4 +; Z13-NEXT: vfadb %v3, %v3, %v6 +; Z13-NEXT: vfadb %v0, %v0, %v2 +; Z13-NEXT: vfadb %v0, %v0, %v3 +; Z13-NEXT: vfadb %v1, %v1, %v5 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_16_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: vl %v2, 88(%r2), 3 +; Z15-NEXT: vl %v3, 56(%r2), 3 +; Z15-NEXT: vl %v4, 72(%r2), 3 +; Z15-NEXT: vl %v5, 40(%r2), 3 +; Z15-NEXT: vl %v6, 120(%r2), 3 +; Z15-NEXT: vl %v7, 104(%r2), 3 +; Z15-NEXT: vfadb %v5, %v5, %v7 +; Z15-NEXT: vfadb %v1, %v1, %v4 +; Z15-NEXT: vfadb %v3, %v3, %v6 +; Z15-NEXT: vfadb %v0, %v0, %v2 +; Z15-NEXT: vfadb %v0, %v0, %v3 +; Z15-NEXT: vfadb %v1, %v1, %v5 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract 
afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 7 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 8 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + %arrayidx14 = getelementptr inbounds double, ptr %x, i64 9 + %8 = load double, ptr %arrayidx14, align 8 + %add15 = fadd reassoc nsz arcp contract afn double %add13, %8 + %arrayidx16 = getelementptr inbounds double, ptr %x, i64 10 + %9 = load double, ptr %arrayidx16, align 8 + %add17 = fadd reassoc nsz arcp contract afn double %add15, %9 + %arrayidx18 = getelementptr inbounds double, ptr %x, i64 11 + %10 = load double, ptr %arrayidx18, align 8 + %add19 = fadd reassoc nsz arcp contract afn double %add17, %10 + %arrayidx20 = getelementptr inbounds double, ptr %x, i64 12 + %11 = load double, ptr %arrayidx20, align 8 + %add21 = fadd reassoc nsz arcp contract afn double %add19, %11 + %arrayidx22 = getelementptr inbounds double, ptr %x, i64 13 + %12 = load double, ptr %arrayidx22, align 8 + %add23 = fadd reassoc nsz arcp contract afn double %add21, %12 + %arrayidx24 = getelementptr inbounds double, ptr %x, i64 14 + %13 = load double, ptr %arrayidx24, align 8 + %add25 = fadd reassoc nsz arcp contract afn double %add23, %13 + %arrayidx26 = getelementptr inbounds double, ptr %x, i64 15 + %14 = load double, ptr %arrayidx26, align 8 + %add27 = fadd reassoc nsz arcp contract afn double %add25, %14 + %arrayidx28 = getelementptr inbounds double, ptr %x, i64 16 + %15 = load double, ptr %arrayidx28, align 8 + %add29 = fadd reassoc nsz arcp contract afn double %add27, %15 + ret double %add29 +} + +define double @fadd_double_4_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_4_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 32(%r2) +; Z13-NEXT: vleg %v0, 48(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_4_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 32(%r2) +; Z15-NEXT: vleg %v0, 48(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + ret double %add5 +} + +define double @fadd_double_6_addends_nonseq(ptr nocapture 
noundef readonly %x) { +; Z13-LABEL: fadd_double_6_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 32(%r2) +; Z13-NEXT: vleg %v0, 48(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: adb %f0, 64(%r2) +; Z13-NEXT: adb %f0, 80(%r2) +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_6_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 32(%r2) +; Z15-NEXT: vleg %v0, 48(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: adb %f0, 64(%r2) +; Z15-NEXT: adb %f0, 80(%r2) +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + ret double %add9 +} + +define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_8_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 64(%r2) +; Z13-NEXT: vleg %v0, 80(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vlrepg %v2, 96(%r2) +; Z13-NEXT: vleg %v2, 112(%r2), 1 +; Z13-NEXT: vlrepg %v3, 32(%r2) +; Z13-NEXT: vleg %v3, 48(%r2), 1 +; Z13-NEXT: vfadb %v2, %v3, %v2 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vfadb %v0, %v0, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_8_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 64(%r2) +; Z15-NEXT: vleg %v0, 80(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vlrepg %v2, 96(%r2) +; Z15-NEXT: vleg %v2, 112(%r2), 1 +; Z15-NEXT: vlrepg %v3, 32(%r2) +; Z15-NEXT: vleg %v3, 48(%r2), 1 +; Z15-NEXT: vfadb %v2, %v3, %v2 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vfadb %v0, %v0, %v2 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double 
%add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define double @fadd_double_16_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_double_16_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 160(%r2) +; Z13-NEXT: vleg %v0, 176(%r2), 1 +; Z13-NEXT: vlrepg %v1, 32(%r2) +; Z13-NEXT: vleg %v1, 48(%r2), 1 +; Z13-NEXT: vlrepg %v2, 224(%r2) +; Z13-NEXT: vleg %v2, 240(%r2), 1 +; Z13-NEXT: vlrepg %v3, 96(%r2) +; Z13-NEXT: vleg %v3, 112(%r2), 1 +; Z13-NEXT: vlrepg %v4, 128(%r2) +; Z13-NEXT: vleg %v4, 144(%r2), 1 +; Z13-NEXT: vlrepg %v5, 0(%r2) +; Z13-NEXT: vleg %v5, 16(%r2), 1 +; Z13-NEXT: vlrepg %v6, 192(%r2) +; Z13-NEXT: vfadb %v4, %v5, %v4 +; Z13-NEXT: vfadb %v2, %v3, %v2 +; Z13-NEXT: vleg %v6, 208(%r2), 1 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vfadb %v0, %v0, %v2 +; Z13-NEXT: vlrepg %v7, 64(%r2) +; Z13-NEXT: vleg %v7, 80(%r2), 1 +; Z13-NEXT: vfadb %v6, %v7, %v6 +; Z13-NEXT: vfadb %v1, %v4, %v6 +; Z13-NEXT: vfadb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfadb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_double_16_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 160(%r2) +; Z15-NEXT: vleg %v0, 176(%r2), 1 +; Z15-NEXT: vlrepg %v1, 32(%r2) +; Z15-NEXT: vleg %v1, 48(%r2), 1 +; Z15-NEXT: vlrepg %v2, 224(%r2) +; Z15-NEXT: vleg %v2, 240(%r2), 1 +; Z15-NEXT: vlrepg %v3, 96(%r2) +; Z15-NEXT: vleg %v3, 112(%r2), 1 +; Z15-NEXT: vlrepg %v4, 128(%r2) +; Z15-NEXT: vleg %v4, 144(%r2), 1 +; Z15-NEXT: vlrepg %v5, 0(%r2) +; Z15-NEXT: vleg %v5, 16(%r2), 1 +; Z15-NEXT: vlrepg %v6, 192(%r2) +; Z15-NEXT: vfadb %v4, %v5, %v4 +; Z15-NEXT: vfadb %v2, %v3, %v2 +; Z15-NEXT: vleg %v6, 208(%r2), 1 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vfadb %v0, %v0, %v2 +; Z15-NEXT: vlrepg %v7, 64(%r2) +; Z15-NEXT: vleg %v7, 80(%r2), 1 +; Z15-NEXT: vfadb %v6, %v7, %v6 +; Z15-NEXT: vfadb %v1, %v4, %v6 +; Z15-NEXT: vfadb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfadb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr 
%arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + %arrayidx14 = getelementptr inbounds double, ptr %x, i64 16 + %8 = load double, ptr %arrayidx14, align 8 + %add15 = fadd reassoc nsz arcp contract afn double %add13, %8 + %arrayidx16 = getelementptr inbounds double, ptr %x, i64 18 + %9 = load double, ptr %arrayidx16, align 8 + %add17 = fadd reassoc nsz arcp contract afn double %add15, %9 + %arrayidx18 = getelementptr inbounds double, ptr %x, i64 20 + %10 = load double, ptr %arrayidx18, align 8 + %add19 = fadd reassoc nsz arcp contract afn double %add17, %10 + %arrayidx20 = getelementptr inbounds double, ptr %x, i64 22 + %11 = load double, ptr %arrayidx20, align 8 + %add21 = fadd reassoc nsz arcp contract afn double %add19, %11 + %arrayidx22 = getelementptr inbounds double, ptr %x, i64 24 + %12 = load double, ptr %arrayidx22, align 8 + %add23 = fadd reassoc nsz arcp contract afn double %add21, %12 + %arrayidx24 = getelementptr inbounds double, ptr %x, i64 26 + %13 = load double, ptr %arrayidx24, align 8 + %add25 = fadd reassoc nsz arcp contract afn double %add23, %13 + %arrayidx26 = getelementptr inbounds double, ptr %x, i64 28 + %14 = load double, ptr %arrayidx26, align 8 + %add27 = fadd reassoc nsz arcp contract afn double %add25, %14 + %arrayidx28 = getelementptr inbounds double, ptr %x, i64 30 + %15 = load double, ptr %arrayidx28, align 8 + %add29 = fadd reassoc nsz arcp contract afn double %add27, %15 + ret double %add29 +} + +define float @fadd_float_4_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_4_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 12(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_4_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + ret float %add5 +} + +define float @fadd_float_6_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_6_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 12(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 20(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_6_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: aeb %f0, 
16(%r2) +; Z15-NEXT: aeb %f0, 20(%r2) +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + ret float %add9 +} + +define float @fadd_float_8_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_8_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 12(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 20(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: aeb %f0, 28(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_8_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vfasb %v0, %v1, %v0 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + ret float %add13 +} + +define float @fadd_float_16_addends_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_16_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 12(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 20(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: aeb %f0, 28(%r2) +; Z13-NEXT: aeb %f0, 32(%r2) +; Z13-NEXT: aeb %f0, 36(%r2) +; Z13-NEXT: aeb %f0, 40(%r2) +; Z13-NEXT: aeb %f0, 44(%r2) +; Z13-NEXT: aeb %f0, 48(%r2) +; Z13-NEXT: aeb %f0, 52(%r2) +; Z13-NEXT: aeb %f0, 56(%r2) 
+; Z13-NEXT: aeb %f0, 60(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_16_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 32(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vl %v2, 48(%r2) +; Z15-NEXT: vl %v3, 16(%r2) +; Z15-NEXT: vfasb %v2, %v3, %v2 +; Z15-NEXT: vfasb %v0, %v1, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v2 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 8 + %8 = load float, ptr %arrayidx14, align 4 + %add15 = fadd reassoc nsz arcp contract afn float %add13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 9 + %9 = load float, ptr %arrayidx16, align 4 + %add17 = fadd reassoc nsz arcp contract afn float %add15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 10 + %10 = load float, ptr %arrayidx18, align 4 + %add19 = fadd reassoc nsz arcp contract afn float %add17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 11 + %11 = load float, ptr %arrayidx20, align 4 + %add21 = fadd reassoc nsz arcp contract afn float %add19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 12 + %12 = load float, ptr %arrayidx22, align 4 + %add23 = fadd reassoc nsz arcp contract afn float %add21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 13 + %13 = load float, ptr %arrayidx24, align 4 + %add25 = fadd reassoc nsz arcp contract afn float %add23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 14 + %14 = load float, ptr %arrayidx26, align 4 + %add27 = fadd reassoc nsz arcp contract afn float %add25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 15 + %15 = load float, ptr %arrayidx28, align 4 + %add29 = fadd reassoc nsz arcp contract afn float %add27, %15 + ret float %add29 +} + +define float @fadd_float_4_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_4_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_4_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 0(%r2) +; Z15-NEXT: vlef %v0, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 
16(%r2), 2 +; Z15-NEXT: vlef %v0, 24(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + ret float %add5 +} + +define float @fadd_float_6_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_6_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: aeb %f0, 32(%r2) +; Z13-NEXT: aeb %f0, 40(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_6_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 0(%r2) +; Z15-NEXT: vlef %v0, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 24(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: aeb %f0, 32(%r2) +; Z15-NEXT: aeb %f0, 40(%r2) +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + ret float %add9 +} + +define float @fadd_float_8_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_8_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: aeb %f0, 32(%r2) +; Z13-NEXT: aeb %f0, 40(%r2) +; Z13-NEXT: aeb %f0, 48(%r2) +; Z13-NEXT: aeb %f0, 56(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_8_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 32(%r2) +; Z15-NEXT: vlef %v0, 40(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 48(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 56(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vfasb %v0, %v1, %v0 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr 
inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + ret float %add13 +} + +define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fadd_float_16_addends_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: aeb %f0, 0(%r2) +; Z13-NEXT: aeb %f0, 16(%r2) +; Z13-NEXT: aeb %f0, 24(%r2) +; Z13-NEXT: aeb %f0, 32(%r2) +; Z13-NEXT: aeb %f0, 40(%r2) +; Z13-NEXT: aeb %f0, 48(%r2) +; Z13-NEXT: aeb %f0, 56(%r2) +; Z13-NEXT: aeb %f0, 64(%r2) +; Z13-NEXT: aeb %f0, 72(%r2) +; Z13-NEXT: aeb %f0, 80(%r2) +; Z13-NEXT: aeb %f0, 88(%r2) +; Z13-NEXT: aeb %f0, 96(%r2) +; Z13-NEXT: aeb %f0, 104(%r2) +; Z13-NEXT: aeb %f0, 112(%r2) +; Z13-NEXT: aeb %f0, 120(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fadd_float_16_addends_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 64(%r2) +; Z15-NEXT: vlef %v0, 72(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 8(%r2), 1 +; Z15-NEXT: vlrepf %v2, 96(%r2) +; Z15-NEXT: vlef %v2, 104(%r2), 1 +; Z15-NEXT: vlrepf %v3, 32(%r2) +; Z15-NEXT: vlef %v3, 40(%r2), 1 +; Z15-NEXT: vlef %v0, 80(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v2, 112(%r2), 2 +; Z15-NEXT: vlef %v3, 48(%r2), 2 +; Z15-NEXT: vlef %v0, 88(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vfasb %v0, %v1, %v0 +; Z15-NEXT: vlef %v2, 120(%r2), 3 +; Z15-NEXT: vlef %v3, 56(%r2), 3 +; Z15-NEXT: vfasb %v2, %v3, %v2 +; Z15-NEXT: vfasb %v0, %v0, %v2 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfasb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float 
%add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %add15 = fadd reassoc nsz arcp contract afn float %add13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %add17 = fadd reassoc nsz arcp contract afn float %add15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %add19 = fadd reassoc nsz arcp contract afn float %add17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %add21 = fadd reassoc nsz arcp contract afn float %add19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %add23 = fadd reassoc nsz arcp contract afn float %add21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %add25 = fadd reassoc nsz arcp contract afn float %add23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %add27 = fadd reassoc nsz arcp contract afn float %add25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, align 4 + %add29 = fadd reassoc nsz arcp contract afn float %add27, %15 + ret float %add29 +} + +define void @faddfp128_4_addends_seq(ptr noalias nocapture writeonly sret(fp128) align 8 %agg.result, ptr nocapture noundef readonly %x) { +; Z13-LABEL: faddfp128_4_addends_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: ld %f0, 0(%r3) +; Z13-NEXT: ld %f2, 8(%r3) +; Z13-NEXT: ld %f1, 16(%r3) +; Z13-NEXT: ld %f3, 24(%r3) +; Z13-NEXT: axbr %f1, %f0 +; Z13-NEXT: ld %f0, 32(%r3) +; Z13-NEXT: ld %f2, 40(%r3) +; Z13-NEXT: axbr %f0, %f1 +; Z13-NEXT: ld %f1, 48(%r3) +; Z13-NEXT: ld %f3, 56(%r3) +; Z13-NEXT: axbr %f1, %f0 +; Z13-NEXT: std %f1, 0(%r2) +; Z13-NEXT: std %f3, 8(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: faddfp128_4_addends_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 32(%r3), 3 +; Z15-NEXT: vl %v1, 0(%r3), 3 +; Z15-NEXT: vl %v2, 48(%r3), 3 +; Z15-NEXT: vl %v3, 16(%r3), 3 +; Z15-NEXT: wfaxb %v2, %v3, %v2 +; Z15-NEXT: wfaxb %v0, %v1, %v0 +; Z15-NEXT: wfaxb %v0, %v0, %v2 +; Z15-NEXT: vst %v0, 0(%r2), 3 +; Z15-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn fp128 %1, %0 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3 + store fp128 %add5, ptr %agg.result, align 8 + ret void +} Index: llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll @@ -0,0 +1,916 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z13 -O3 -o - | FileCheck %s --check-prefix=Z13 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -o - | FileCheck %s --check-prefix=Z15 + +define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmin_double_4_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: vl %v0, 16(%r2) +; Z13-NEXT: vl %v1, 0(%r2) +; Z13-NEXT: vfchedb %v2, %v1, %v0 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfchedb %v2, %v0, %v1 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmin_double_4_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 16(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vfmindb %v0, %v1, %v0, 4 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmindb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmin_double_16_nums_nonseq: +; Z13: # %bb.0: +; Z13-NEXT: vlrepg %v0, 224(%r2) +; Z13-NEXT: vleg %v0, 240(%r2), 1 +; Z13-NEXT: vlrepg %v1, 96(%r2) +; Z13-NEXT: vleg %v1, 112(%r2), 1 +; Z13-NEXT: vlrepg %v2, 160(%r2) +; Z13-NEXT: vleg %v2, 176(%r2), 1 +; Z13-NEXT: vlrepg %v3, 32(%r2) +; Z13-NEXT: vleg %v3, 48(%r2), 1 +; Z13-NEXT: vlrepg %v4, 192(%r2) +; Z13-NEXT: vleg %v4, 208(%r2), 1 +; Z13-NEXT: vlrepg %v5, 64(%r2) +; Z13-NEXT: vleg %v5, 80(%r2), 1 +; Z13-NEXT: vlrepg %v6, 128(%r2) +; Z13-NEXT: vfchedb %v17, %v5, %v4 +; Z13-NEXT: vfchedb %v18, %v3, %v2 +; Z13-NEXT: vleg %v6, 144(%r2), 1 +; Z13-NEXT: vfchedb %v19, %v1, %v0 +; Z13-NEXT: vlrepg %v7, 0(%r2) +; Z13-NEXT: vsel %v0, %v0, %v1, %v19 +; Z13-NEXT: vsel %v1, %v2, %v3, %v18 +; Z13-NEXT: vleg %v7, 16(%r2), 1 +; Z13-NEXT: vfchedb %v16, %v7, %v6 +; Z13-NEXT: vsel %v2, %v4, %v5, %v17 +; Z13-NEXT: vsel %v3, %v6, %v7, %v16 +; Z13-NEXT: vfchedb %v4, %v3, %v2 +; Z13-NEXT: vfchedb %v5, %v1, %v0 +; Z13-NEXT: vsel %v0, %v0, %v1, %v5 +; Z13-NEXT: vsel %v1, %v2, %v3, %v4 +; Z13-NEXT: vfchedb %v2, %v1, %v0 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfchedb %v2, %v0, %v1 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmin_double_16_nums_nonseq: +; Z15: # %bb.0: +; Z15-NEXT: vlrepg %v0, 160(%r2) +; Z15-NEXT: vleg %v0, 176(%r2), 1 +; Z15-NEXT: vlrepg %v1, 32(%r2) +; Z15-NEXT: vleg %v1, 48(%r2), 1 +; Z15-NEXT: vlrepg %v2, 224(%r2) +; Z15-NEXT: vleg %v2, 240(%r2), 1 +; Z15-NEXT: vlrepg %v3, 96(%r2) +; Z15-NEXT: vleg %v3, 112(%r2), 1 +; Z15-NEXT: vlrepg %v4, 128(%r2) +; Z15-NEXT: vleg %v4, 144(%r2), 1 +; Z15-NEXT: vlrepg %v5, 0(%r2) +; Z15-NEXT: vleg %v5, 16(%r2), 1 +; Z15-NEXT: vlrepg %v6, 192(%r2) +; 
Z15-NEXT: vfmindb %v4, %v5, %v4, 4 +; Z15-NEXT: vfmindb %v2, %v3, %v2, 4 +; Z15-NEXT: vleg %v6, 208(%r2), 1 +; Z15-NEXT: vfmindb %v0, %v1, %v0, 4 +; Z15-NEXT: vfmindb %v0, %v0, %v2, 4 +; Z15-NEXT: vlrepg %v7, 64(%r2) +; Z15-NEXT: vleg %v7, 80(%r2), 1 +; Z15-NEXT: vfmindb %v6, %v7, %v6, 4 +; Z15-NEXT: vfmindb %v1, %v4, %v6, 4 +; Z15-NEXT: vfmindb %v0, %v1, %v0, 4 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmindb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.minnum.f64(double %t11, double %m10) + %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmin_float_8_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmin_float_8_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: vl %v0, 0(%r2) +; Z13-NEXT: vl %v1, 16(%r2) +; Z13-NEXT: vmrlf %v2, %v1, %v1 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vldeb %v2, %v2 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchedb %v2, %v3, %v2 +; Z13-NEXT: vmrhf %v3, %v1, %v1 +; Z13-NEXT: vmrhf %v4, %v0, %v0 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vldeb %v4, %v4 +; Z13-NEXT: vfchedb %v3, %v4, %v3 +; 
Z13-NEXT: vpkg %v2, %v3, %v2 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: vmrlg %v1, %v0, %v0 +; Z13-NEXT: vmrhf %v2, %v0, %v0 +; Z13-NEXT: vmrhf %v3, %v1, %v1 +; Z13-NEXT: vldeb %v2, %v2 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchedb %v2, %v2, %v3 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vldeb %v4, %v0 +; Z13-NEXT: vfchedb %v3, %v3, %v4 +; Z13-NEXT: vpkg %v2, %v2, %v3 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: vrepf %v1, %v0, 1 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vmrhf %v4, %v0, %v0 +; Z13-NEXT: vldeb %v2, %v1 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchedb %v3, %v3, %v2 +; Z13-NEXT: vldeb %v4, %v4 +; Z13-NEXT: vfchedb %v2, %v4, %v2 +; Z13-NEXT: vpkg %v2, %v2, %v3 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmin_float_8_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 16(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vfminsb %v0, %v1, %v0, 4 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfminsb %v0, %v0, %v1, 4 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfminsb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds float, ptr %x, i64 1 + %g2 = getelementptr inbounds float, ptr %x, i64 2 + %g3 = getelementptr inbounds float, ptr %x, i64 3 + %g4 = getelementptr inbounds float, ptr %x, i64 4 + %g5 = getelementptr inbounds float, ptr %x, i64 5 + %g6 = getelementptr inbounds float, ptr %x, i64 6 + %g7 = getelementptr inbounds float, ptr %x, i64 7 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6) + ret float %m7 +} + +define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmin_float_12_nums_nonseq: +; Z13: # %bb.0: +; Z13-NEXT: lde %f1, 0(%r2) +; Z13-NEXT: lde %f2, 8(%r2) +; Z13-NEXT: lde %f0, 16(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnl .LBB3_13 +; Z13-NEXT: # %bb.1: +; Z13-NEXT: lde %f1, 24(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnl .LBB3_14 +; Z13-NEXT: .LBB3_2: +; Z13-NEXT: lde %f2, 32(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnl .LBB3_15 +; Z13-NEXT: .LBB3_3: +; Z13-NEXT: lde %f0, 40(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnl .LBB3_16 +; Z13-NEXT: .LBB3_4: +; Z13-NEXT: lde %f1, 48(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnl .LBB3_17 +; Z13-NEXT: .LBB3_5: +; Z13-NEXT: lde %f2, 56(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnl .LBB3_18 +; Z13-NEXT: .LBB3_6: +; Z13-NEXT: lde %f0, 64(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnl .LBB3_19 +; Z13-NEXT: .LBB3_7: +; Z13-NEXT: lde %f1, 72(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnl .LBB3_20 +; Z13-NEXT: .LBB3_8: +; Z13-NEXT: lde %f2, 80(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnl .LBB3_21 +; Z13-NEXT: .LBB3_9: +; Z13-NEXT: lde %f0, 88(%r2) +; Z13-NEXT: cebr 
%f2, %f1 +; Z13-NEXT: jl .LBB3_11 +; Z13-NEXT: .LBB3_10: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: .LBB3_11: +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: blr %r14 +; Z13-NEXT: .LBB3_12: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: br %r14 +; Z13-NEXT: .LBB3_13: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 24(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jl .LBB3_2 +; Z13-NEXT: .LBB3_14: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 32(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jl .LBB3_3 +; Z13-NEXT: .LBB3_15: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 40(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jl .LBB3_4 +; Z13-NEXT: .LBB3_16: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 48(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jl .LBB3_5 +; Z13-NEXT: .LBB3_17: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 56(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jl .LBB3_6 +; Z13-NEXT: .LBB3_18: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 64(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jl .LBB3_7 +; Z13-NEXT: .LBB3_19: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 72(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jl .LBB3_8 +; Z13-NEXT: .LBB3_20: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 80(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jl .LBB3_9 +; Z13-NEXT: .LBB3_21: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 88(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnl .LBB3_10 +; Z13-NEXT: j .LBB3_11 +; +; Z15-LABEL: fmin_float_12_nums_nonseq: +; Z15: # %bb.0: +; Z15-NEXT: vlrepf %v0, 32(%r2) +; Z15-NEXT: vlef %v0, 40(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 48(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 56(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vlrepf %v2, 64(%r2) +; Z15-NEXT: vlef %v2, 72(%r2), 1 +; Z15-NEXT: vlef %v2, 80(%r2), 2 +; Z15-NEXT: vfminsb %v0, %v1, %v0, 4 +; Z15-NEXT: vlef %v2, 88(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfminsb %v0, %v0, %v1, 4 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfminsb %v0, %v0, %v1, 4 +; Z15-NEXT: vmrlg %v1, %v2, %v2 +; Z15-NEXT: vfminsb %v1, %v2, %v1, 4 +; Z15-NEXT: vrepf %v2, %v1, 1 +; Z15-NEXT: vfminsb %v1, %v1, %v2, 4 +; Z15-NEXT: wfminsb %f0, %f0, %f1, 4 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + %g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.minnum.f32(float %t4, 
float %m3) + %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10) + ret float %m11 +} + +define fp128 @fmin_fp128_4_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmin_fp128_4_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: ld %f5, 48(%r3) +; Z13-NEXT: ld %f7, 56(%r3) +; Z13-NEXT: ld %f1, 16(%r3) +; Z13-NEXT: ld %f3, 24(%r3) +; Z13-NEXT: ld %f4, 32(%r3) +; Z13-NEXT: ld %f6, 40(%r3) +; Z13-NEXT: cxbr %f1, %f5 +; Z13-NEXT: ld %f0, 0(%r3) +; Z13-NEXT: ld %f2, 8(%r3) +; Z13-NEXT: jnl .LBB4_4 +; Z13-NEXT: # %bb.1: +; Z13-NEXT: cxbr %f0, %f4 +; Z13-NEXT: jnl .LBB4_5 +; Z13-NEXT: .LBB4_2: +; Z13-NEXT: cxbr %f0, %f1 +; Z13-NEXT: jnl .LBB4_6 +; Z13-NEXT: .LBB4_3: +; Z13-NEXT: std %f0, 0(%r2) +; Z13-NEXT: std %f2, 8(%r2) +; Z13-NEXT: br %r14 +; Z13-NEXT: .LBB4_4: +; Z13-NEXT: cxbr %f0, %f4 +; Z13-NEXT: lxr %f1, %f5 +; Z13-NEXT: jl .LBB4_2 +; Z13-NEXT: .LBB4_5: +; Z13-NEXT: lxr %f0, %f4 +; Z13-NEXT: cxbr %f0, %f1 +; Z13-NEXT: jl .LBB4_3 +; Z13-NEXT: .LBB4_6: +; Z13-NEXT: lxr %f0, %f1 +; Z13-NEXT: std %f0, 0(%r2) +; Z13-NEXT: std %f2, 8(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmin_fp128_4_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 32(%r3) +; Z15-NEXT: vl %v1, 0(%r3) +; Z15-NEXT: vl %v2, 48(%r3) +; Z15-NEXT: vl %v3, 16(%r3) +; Z15-NEXT: wfminxb %v2, %v3, %v2, 4 +; Z15-NEXT: wfminxb %v0, %v1, %v0, 4 +; Z15-NEXT: wfminxb %v0, %v0, %v2, 4 +; Z15-NEXT: vst %v0, 0(%r2), 3 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds fp128, ptr %x, i64 1 + %g2 = getelementptr inbounds fp128, ptr %x, i64 2 + %g3 = getelementptr inbounds fp128, ptr %x, i64 3 + %t0 = load fp128, ptr %x, align 4 + %t1 = load fp128, ptr %g1, align 4 + %t2 = load fp128, ptr %g2, align 4 + %t3 = load fp128, ptr %g3, align 4 + %m1 = tail call fast fp128 @llvm.minnum.f128(fp128 %t1, fp128 %t0) + %m2 = tail call fast fp128 @llvm.minnum.f128(fp128 %t2, fp128 %m1) + %m3 = tail call fast fp128 @llvm.minnum.f128(fp128 %t3, fp128 %m2) + ret fp128 %m3 +} + +define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmax_double_4_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: vl %v0, 16(%r2) +; Z13-NEXT: vl %v1, 0(%r2) +; Z13-NEXT: vfchdb %v2, %v1, %v0 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfchdb %v2, %v0, %v1 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmax_double_4_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 16(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vfmaxdb %v0, %v1, %v0, 4 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmaxdb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + 
%m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmax_double_16_nums_nonseq: +; Z13: # %bb.0: +; Z13-NEXT: vlrepg %v0, 224(%r2) +; Z13-NEXT: vleg %v0, 240(%r2), 1 +; Z13-NEXT: vlrepg %v1, 96(%r2) +; Z13-NEXT: vleg %v1, 112(%r2), 1 +; Z13-NEXT: vlrepg %v2, 160(%r2) +; Z13-NEXT: vleg %v2, 176(%r2), 1 +; Z13-NEXT: vlrepg %v3, 32(%r2) +; Z13-NEXT: vleg %v3, 48(%r2), 1 +; Z13-NEXT: vlrepg %v4, 192(%r2) +; Z13-NEXT: vleg %v4, 208(%r2), 1 +; Z13-NEXT: vlrepg %v5, 64(%r2) +; Z13-NEXT: vleg %v5, 80(%r2), 1 +; Z13-NEXT: vlrepg %v6, 128(%r2) +; Z13-NEXT: vfchdb %v17, %v5, %v4 +; Z13-NEXT: vfchdb %v18, %v3, %v2 +; Z13-NEXT: vleg %v6, 144(%r2), 1 +; Z13-NEXT: vfchdb %v19, %v1, %v0 +; Z13-NEXT: vlrepg %v7, 0(%r2) +; Z13-NEXT: vsel %v0, %v1, %v0, %v19 +; Z13-NEXT: vsel %v1, %v3, %v2, %v18 +; Z13-NEXT: vleg %v7, 16(%r2), 1 +; Z13-NEXT: vfchdb %v16, %v7, %v6 +; Z13-NEXT: vsel %v2, %v5, %v4, %v17 +; Z13-NEXT: vsel %v3, %v7, %v6, %v16 +; Z13-NEXT: vfchdb %v4, %v3, %v2 +; Z13-NEXT: vfchdb %v5, %v1, %v0 +; Z13-NEXT: vsel %v0, %v1, %v0, %v5 +; Z13-NEXT: vsel %v1, %v3, %v2, %v4 +; Z13-NEXT: vfchdb %v2, %v1, %v0 +; Z13-NEXT: vsel %v0, %v1, %v0, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfchdb %v2, %v0, %v1 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmax_double_16_nums_nonseq: +; Z15: # %bb.0: +; Z15-NEXT: vlrepg %v0, 160(%r2) +; Z15-NEXT: vleg %v0, 176(%r2), 1 +; Z15-NEXT: vlrepg %v1, 32(%r2) +; Z15-NEXT: vleg %v1, 48(%r2), 1 +; Z15-NEXT: vlrepg %v2, 224(%r2) +; Z15-NEXT: vleg %v2, 240(%r2), 1 +; Z15-NEXT: vlrepg %v3, 96(%r2) +; Z15-NEXT: vleg %v3, 112(%r2), 1 +; Z15-NEXT: vlrepg %v4, 128(%r2) +; Z15-NEXT: vleg %v4, 144(%r2), 1 +; Z15-NEXT: vlrepg %v5, 0(%r2) +; Z15-NEXT: vleg %v5, 16(%r2), 1 +; Z15-NEXT: vlrepg %v6, 192(%r2) +; Z15-NEXT: vfmaxdb %v4, %v5, %v4, 4 +; Z15-NEXT: vfmaxdb %v2, %v3, %v2, 4 +; Z15-NEXT: vleg %v6, 208(%r2), 1 +; Z15-NEXT: vfmaxdb %v0, %v1, %v0, 4 +; Z15-NEXT: vfmaxdb %v0, %v0, %v2, 4 +; Z15-NEXT: vlrepg %v7, 64(%r2) +; Z15-NEXT: vleg %v7, 80(%r2), 1 +; Z15-NEXT: vfmaxdb %v6, %v7, %v6, 4 +; Z15-NEXT: vfmaxdb %v1, %v4, %v6, 4 +; Z15-NEXT: vfmaxdb %v0, %v1, %v0, 4 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmaxdb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = 
load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10) + %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmax_float_8_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmax_float_8_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: vl %v0, 0(%r2) +; Z13-NEXT: vl %v1, 16(%r2) +; Z13-NEXT: vmrlf %v2, %v1, %v1 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vldeb %v2, %v2 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchdb %v2, %v3, %v2 +; Z13-NEXT: vmrhf %v3, %v1, %v1 +; Z13-NEXT: vmrhf %v4, %v0, %v0 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vldeb %v4, %v4 +; Z13-NEXT: vfchdb %v3, %v4, %v3 +; Z13-NEXT: vpkg %v2, %v3, %v2 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: vmrlg %v1, %v0, %v0 +; Z13-NEXT: vmrhf %v2, %v0, %v0 +; Z13-NEXT: vmrhf %v3, %v1, %v1 +; Z13-NEXT: vldeb %v2, %v2 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchdb %v2, %v2, %v3 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vldeb %v4, %v0 +; Z13-NEXT: vfchdb %v3, %v3, %v4 +; Z13-NEXT: vpkg %v2, %v2, %v3 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: vrepf %v1, %v0, 1 +; Z13-NEXT: vmrlf %v3, %v0, %v0 +; Z13-NEXT: vmrhf %v4, %v0, %v0 +; Z13-NEXT: vldeb %v2, %v1 +; Z13-NEXT: vldeb %v3, %v3 +; Z13-NEXT: vfchdb %v3, %v3, %v2 +; Z13-NEXT: vldeb %v4, %v4 +; Z13-NEXT: vfchdb %v2, %v4, %v2 +; Z13-NEXT: vpkg %v2, %v2, %v3 +; Z13-NEXT: vsel %v0, %v0, %v1, %v2 +; Z13-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmax_float_8_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 16(%r2) +; Z15-NEXT: vl %v1, 0(%r2) +; Z15-NEXT: vfmaxsb %v0, %v1, %v0, 4 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmaxsb %v0, %v0, %v1, 4 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmaxsb %v0, %v0, %v1, 4 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds float, ptr %x, i64 1 + %g2 = getelementptr inbounds float, ptr %x, i64 2 + %g3 = getelementptr inbounds float, ptr %x, i64 3 + %g4 = getelementptr inbounds float, ptr %x, i64 4 + %g5 = getelementptr inbounds float, ptr %x, i64 5 + %g6 = getelementptr inbounds float, ptr %x, i64 6 + %g7 = getelementptr inbounds float, ptr %x, i64 7 + %t0 = load float, ptr %x, align 
4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6) + ret float %m7 +} + +define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmax_float_12_nums_nonseq: +; Z13: # %bb.0: +; Z13-NEXT: lde %f1, 0(%r2) +; Z13-NEXT: lde %f2, 8(%r2) +; Z13-NEXT: lde %f0, 16(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnh .LBB8_13 +; Z13-NEXT: # %bb.1: +; Z13-NEXT: lde %f1, 24(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnh .LBB8_14 +; Z13-NEXT: .LBB8_2: +; Z13-NEXT: lde %f2, 32(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnh .LBB8_15 +; Z13-NEXT: .LBB8_3: +; Z13-NEXT: lde %f0, 40(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnh .LBB8_16 +; Z13-NEXT: .LBB8_4: +; Z13-NEXT: lde %f1, 48(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnh .LBB8_17 +; Z13-NEXT: .LBB8_5: +; Z13-NEXT: lde %f2, 56(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnh .LBB8_18 +; Z13-NEXT: .LBB8_6: +; Z13-NEXT: lde %f0, 64(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnh .LBB8_19 +; Z13-NEXT: .LBB8_7: +; Z13-NEXT: lde %f1, 72(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jnh .LBB8_20 +; Z13-NEXT: .LBB8_8: +; Z13-NEXT: lde %f2, 80(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jnh .LBB8_21 +; Z13-NEXT: .LBB8_9: +; Z13-NEXT: lde %f0, 88(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jh .LBB8_11 +; Z13-NEXT: .LBB8_10: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: .LBB8_11: +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: bhr %r14 +; Z13-NEXT: .LBB8_12: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: br %r14 +; Z13-NEXT: .LBB8_13: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 24(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jh .LBB8_2 +; Z13-NEXT: .LBB8_14: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 32(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jh .LBB8_3 +; Z13-NEXT: .LBB8_15: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 40(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jh .LBB8_4 +; Z13-NEXT: .LBB8_16: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 48(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jh .LBB8_5 +; Z13-NEXT: .LBB8_17: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 56(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jh .LBB8_6 +; Z13-NEXT: .LBB8_18: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 64(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jh .LBB8_7 +; Z13-NEXT: .LBB8_19: +; Z13-NEXT: ldr %f2, %f1 +; Z13-NEXT: lde %f1, 72(%r2) +; Z13-NEXT: cebr %f0, %f2 +; Z13-NEXT: jh .LBB8_8 +; Z13-NEXT: .LBB8_20: +; Z13-NEXT: ldr %f0, %f2 +; Z13-NEXT: lde %f2, 80(%r2) +; Z13-NEXT: cebr %f1, %f0 +; Z13-NEXT: jh .LBB8_9 +; Z13-NEXT: .LBB8_21: +; Z13-NEXT: ldr %f1, %f0 +; Z13-NEXT: lde %f0, 88(%r2) +; Z13-NEXT: cebr %f2, %f1 +; Z13-NEXT: jnh .LBB8_10 +; Z13-NEXT: j .LBB8_11 +; +; Z15-LABEL: fmax_float_12_nums_nonseq: +; Z15: # %bb.0: +; Z15-NEXT: vlrepf %v0, 32(%r2) +; Z15-NEXT: vlef %v0, 40(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 
8(%r2), 1 +; Z15-NEXT: vlef %v0, 48(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 56(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vlrepf %v2, 64(%r2) +; Z15-NEXT: vlef %v2, 72(%r2), 1 +; Z15-NEXT: vlef %v2, 80(%r2), 2 +; Z15-NEXT: vfmaxsb %v0, %v1, %v0, 4 +; Z15-NEXT: vlef %v2, 88(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmaxsb %v0, %v0, %v1, 4 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmaxsb %v0, %v0, %v1, 4 +; Z15-NEXT: vmrlg %v1, %v2, %v2 +; Z15-NEXT: vfmaxsb %v1, %v2, %v1, 4 +; Z15-NEXT: vrepf %v2, %v1, 1 +; Z15-NEXT: vfmaxsb %v1, %v1, %v2, 4 +; Z15-NEXT: wfmaxsb %f0, %f0, %f1, 4 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + %g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10) + ret float %m11 +} + +define fp128 @fmax_fp128_4_nums_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmax_fp128_4_nums_seq: +; Z13: # %bb.0: +; Z13-NEXT: ld %f5, 48(%r3) +; Z13-NEXT: ld %f7, 56(%r3) +; Z13-NEXT: ld %f1, 16(%r3) +; Z13-NEXT: ld %f3, 24(%r3) +; Z13-NEXT: ld %f4, 32(%r3) +; Z13-NEXT: ld %f6, 40(%r3) +; Z13-NEXT: cxbr %f1, %f5 +; Z13-NEXT: ld %f0, 0(%r3) +; Z13-NEXT: ld %f2, 8(%r3) +; Z13-NEXT: jnh .LBB9_4 +; Z13-NEXT: # %bb.1: +; Z13-NEXT: cxbr %f0, %f4 +; Z13-NEXT: jnh .LBB9_5 +; Z13-NEXT: .LBB9_2: +; Z13-NEXT: cxbr %f0, %f1 +; Z13-NEXT: jnh .LBB9_6 +; Z13-NEXT: .LBB9_3: +; Z13-NEXT: std %f0, 0(%r2) +; Z13-NEXT: std %f2, 8(%r2) +; Z13-NEXT: br %r14 +; Z13-NEXT: .LBB9_4: +; Z13-NEXT: cxbr %f0, %f4 +; Z13-NEXT: lxr %f1, %f5 +; Z13-NEXT: jh .LBB9_2 +; Z13-NEXT: .LBB9_5: +; Z13-NEXT: lxr %f0, %f4 +; Z13-NEXT: cxbr %f0, %f1 +; Z13-NEXT: jh .LBB9_3 +; Z13-NEXT: .LBB9_6: +; Z13-NEXT: lxr %f0, %f1 +; Z13-NEXT: std %f0, 0(%r2) +; Z13-NEXT: std %f2, 8(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmax_fp128_4_nums_seq: +; Z15: # %bb.0: +; Z15-NEXT: vl %v0, 32(%r3) +; Z15-NEXT: vl %v1, 
0(%r3) +; Z15-NEXT: vl %v2, 48(%r3) +; Z15-NEXT: vl %v3, 16(%r3) +; Z15-NEXT: wfmaxxb %v2, %v3, %v2, 4 +; Z15-NEXT: wfmaxxb %v0, %v1, %v0, 4 +; Z15-NEXT: wfmaxxb %v0, %v0, %v2, 4 +; Z15-NEXT: vst %v0, 0(%r2), 3 +; Z15-NEXT: br %r14 + %g1 = getelementptr inbounds fp128, ptr %x, i64 1 + %g2 = getelementptr inbounds fp128, ptr %x, i64 2 + %g3 = getelementptr inbounds fp128, ptr %x, i64 3 + %t0 = load fp128, ptr %x, align 4 + %t1 = load fp128, ptr %g1, align 4 + %t2 = load fp128, ptr %g2, align 4 + %t3 = load fp128, ptr %g3, align 4 + %m1 = tail call fast fp128 @llvm.maxnum.f128(fp128 %t1, fp128 %t0) + %m2 = tail call fast fp128 @llvm.maxnum.f128(fp128 %t2, fp128 %m1) + %m3 = tail call fast fp128 @llvm.maxnum.f128(fp128 %t3, fp128 %m2) + ret fp128 %m3 +} + +declare float @llvm.minnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare fp128 @llvm.minnum.f128(fp128, fp128) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.maxnum.f64(double, double) +declare fp128 @llvm.maxnum.f128(fp128, fp128) Index: llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll @@ -0,0 +1,1009 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z13 -O3 -o - | FileCheck %s --check-prefix=Z13 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | llc -mtriple=s390x-linux-gnu -mcpu=z15 -O3 -o - | FileCheck %s --check-prefix=Z15 + +define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_4_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_4_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + ret double %mul5 +} + +define double @fmul_double_6_factors_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_6_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: mdb %f0, 40(%r2) +; Z13-NEXT: mdb %f0, 48(%r2) +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_6_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: 
vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: mdb %f0, 40(%r2) +; Z15-NEXT: mdb %f0, 48(%r2) +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + ret double %mul9 +} + +define double @fmul_double_8_factors_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_8_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 0(%r2), 3 +; Z13-NEXT: vl %v1, 16(%r2), 3 +; Z13-NEXT: vl %v2, 40(%r2), 3 +; Z13-NEXT: vl %v3, 56(%r2), 3 +; Z13-NEXT: vfmdb %v1, %v1, %v3 +; Z13-NEXT: vfmdb %v0, %v0, %v2 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_8_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2), 3 +; Z15-NEXT: vl %v1, 16(%r2), 3 +; Z15-NEXT: vl %v2, 40(%r2), 3 +; Z15-NEXT: vl %v3, 56(%r2), 3 +; Z15-NEXT: vfmdb %v1, %v1, %v3 +; Z15-NEXT: vfmdb %v0, %v0, %v2 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 7 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 8 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + ret double %mul13 +} + +define double @fmul_double_16_factors_seq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_16_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vl %v0, 16(%r2), 3 +; Z13-NEXT: vl %v1, 0(%r2), 3 +; Z13-NEXT: vl %v2, 88(%r2), 3 +; Z13-NEXT: vl %v3, 
56(%r2), 3 +; Z13-NEXT: vl %v4, 72(%r2), 3 +; Z13-NEXT: vl %v5, 40(%r2), 3 +; Z13-NEXT: vl %v6, 120(%r2), 3 +; Z13-NEXT: vl %v7, 104(%r2), 3 +; Z13-NEXT: vfmdb %v5, %v5, %v7 +; Z13-NEXT: vfmdb %v1, %v1, %v4 +; Z13-NEXT: vfmdb %v3, %v3, %v6 +; Z13-NEXT: vfmdb %v0, %v0, %v2 +; Z13-NEXT: vfmdb %v0, %v0, %v3 +; Z13-NEXT: vfmdb %v1, %v1, %v5 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_16_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 16(%r2), 3 +; Z15-NEXT: vl %v1, 0(%r2), 3 +; Z15-NEXT: vl %v2, 88(%r2), 3 +; Z15-NEXT: vl %v3, 56(%r2), 3 +; Z15-NEXT: vl %v4, 72(%r2), 3 +; Z15-NEXT: vl %v5, 40(%r2), 3 +; Z15-NEXT: vl %v6, 120(%r2), 3 +; Z15-NEXT: vl %v7, 104(%r2), 3 +; Z15-NEXT: vfmdb %v5, %v5, %v7 +; Z15-NEXT: vfmdb %v1, %v1, %v4 +; Z15-NEXT: vfmdb %v3, %v3, %v6 +; Z15-NEXT: vfmdb %v0, %v0, %v2 +; Z15-NEXT: vfmdb %v0, %v0, %v3 +; Z15-NEXT: vfmdb %v1, %v1, %v5 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 5 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 6 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 7 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 8 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + %arrayidx14 = getelementptr inbounds double, ptr %x, i64 9 + %8 = load double, ptr %arrayidx14, align 8 + %mul15 = fmul reassoc nsz arcp contract afn double %mul13, %8 + %arrayidx16 = getelementptr inbounds double, ptr %x, i64 10 + %9 = load double, ptr %arrayidx16, align 8 + %mul17 = fmul reassoc nsz arcp contract afn double %mul15, %9 + %arrayidx18 = getelementptr inbounds double, ptr %x, i64 11 + %10 = load double, ptr %arrayidx18, align 8 + %mul19 = fmul reassoc nsz arcp contract afn double %mul17, %10 + %arrayidx20 = getelementptr inbounds double, ptr %x, i64 12 + %11 = load double, ptr %arrayidx20, align 8 + %mul21 = fmul reassoc nsz arcp contract afn double %mul19, %11 + %arrayidx22 = getelementptr inbounds double, ptr %x, i64 13 + %12 = load double, ptr %arrayidx22, align 8 + %mul23 = fmul reassoc nsz arcp contract afn double %mul21, %12 + %arrayidx24 = getelementptr inbounds double, ptr %x, i64 14 + %13 = load double, ptr %arrayidx24, align 8 + %mul25 = fmul reassoc nsz arcp contract afn double %mul23, %13 + %arrayidx26 = getelementptr inbounds double, ptr %x, i64 15 + %14 = load double, ptr %arrayidx26, align 
8 + %mul27 = fmul reassoc nsz arcp contract afn double %mul25, %14 + %arrayidx28 = getelementptr inbounds double, ptr %x, i64 16 + %15 = load double, ptr %arrayidx28, align 8 + %mul29 = fmul reassoc nsz arcp contract afn double %mul27, %15 + ret double %mul29 +} + +define double @fmul_double_4_factors_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_4_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 32(%r2) +; Z13-NEXT: vleg %v0, 48(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_4_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 32(%r2) +; Z15-NEXT: vleg %v0, 48(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + ret double %mul5 +} + +define double @fmul_double_6_factors_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_6_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 32(%r2) +; Z13-NEXT: vleg %v0, 48(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: mdb %f0, 64(%r2) +; Z13-NEXT: mdb %f0, 80(%r2) +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_6_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 32(%r2) +; Z15-NEXT: vleg %v0, 48(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: mdb %f0, 64(%r2) +; Z15-NEXT: mdb %f0, 80(%r2) +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + ret double %mul9 +} + +define double @fmul_double_8_factors_nonseq(ptr 
nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_8_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 64(%r2) +; Z13-NEXT: vleg %v0, 80(%r2), 1 +; Z13-NEXT: vlrepg %v1, 0(%r2) +; Z13-NEXT: vleg %v1, 16(%r2), 1 +; Z13-NEXT: vlrepg %v2, 96(%r2) +; Z13-NEXT: vleg %v2, 112(%r2), 1 +; Z13-NEXT: vlrepg %v3, 32(%r2) +; Z13-NEXT: vleg %v3, 48(%r2), 1 +; Z13-NEXT: vfmdb %v2, %v3, %v2 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vfmdb %v0, %v0, %v2 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_8_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepg %v0, 64(%r2) +; Z15-NEXT: vleg %v0, 80(%r2), 1 +; Z15-NEXT: vlrepg %v1, 0(%r2) +; Z15-NEXT: vleg %v1, 16(%r2), 1 +; Z15-NEXT: vlrepg %v2, 96(%r2) +; Z15-NEXT: vleg %v2, 112(%r2), 1 +; Z15-NEXT: vlrepg %v3, 32(%r2) +; Z15-NEXT: vleg %v3, 48(%r2), 1 +; Z15-NEXT: vfmdb %v2, %v3, %v2 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vfmdb %v0, %v0, %v2 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + ret double %mul13 +} + +define double @fmul_double_16_factors_nonseq(ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_double_16_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: vlrepg %v0, 160(%r2) +; Z13-NEXT: vleg %v0, 176(%r2), 1 +; Z13-NEXT: vlrepg %v1, 32(%r2) +; Z13-NEXT: vleg %v1, 48(%r2), 1 +; Z13-NEXT: vlrepg %v2, 224(%r2) +; Z13-NEXT: vleg %v2, 240(%r2), 1 +; Z13-NEXT: vlrepg %v3, 96(%r2) +; Z13-NEXT: vleg %v3, 112(%r2), 1 +; Z13-NEXT: vlrepg %v4, 128(%r2) +; Z13-NEXT: vleg %v4, 144(%r2), 1 +; Z13-NEXT: vlrepg %v5, 0(%r2) +; Z13-NEXT: vleg %v5, 16(%r2), 1 +; Z13-NEXT: vlrepg %v6, 192(%r2) +; Z13-NEXT: vfmdb %v4, %v5, %v4 +; Z13-NEXT: vfmdb %v2, %v3, %v2 +; Z13-NEXT: vleg %v6, 208(%r2), 1 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vfmdb %v0, %v0, %v2 +; Z13-NEXT: vlrepg %v7, 64(%r2) +; Z13-NEXT: vleg %v7, 80(%r2), 1 +; Z13-NEXT: vfmdb %v6, %v7, %v6 +; Z13-NEXT: vfmdb %v1, %v4, %v6 +; Z13-NEXT: vfmdb %v0, %v1, %v0 +; Z13-NEXT: vrepg %v1, %v0, 1 +; Z13-NEXT: vfmdb %v0, %v0, %v1 +; Z13-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_double_16_factors_nonseq: +; Z15: # %bb.0: 
# %entry +; Z15-NEXT: vlrepg %v0, 160(%r2) +; Z15-NEXT: vleg %v0, 176(%r2), 1 +; Z15-NEXT: vlrepg %v1, 32(%r2) +; Z15-NEXT: vleg %v1, 48(%r2), 1 +; Z15-NEXT: vlrepg %v2, 224(%r2) +; Z15-NEXT: vleg %v2, 240(%r2), 1 +; Z15-NEXT: vlrepg %v3, 96(%r2) +; Z15-NEXT: vleg %v3, 112(%r2), 1 +; Z15-NEXT: vlrepg %v4, 128(%r2) +; Z15-NEXT: vleg %v4, 144(%r2), 1 +; Z15-NEXT: vlrepg %v5, 0(%r2) +; Z15-NEXT: vleg %v5, 16(%r2), 1 +; Z15-NEXT: vlrepg %v6, 192(%r2) +; Z15-NEXT: vfmdb %v4, %v5, %v4 +; Z15-NEXT: vfmdb %v2, %v3, %v2 +; Z15-NEXT: vleg %v6, 208(%r2), 1 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vfmdb %v0, %v0, %v2 +; Z15-NEXT: vlrepg %v7, 64(%r2) +; Z15-NEXT: vleg %v7, 80(%r2), 1 +; Z15-NEXT: vfmdb %v6, %v7, %v6 +; Z15-NEXT: vfmdb %v1, %v4, %v6 +; Z15-NEXT: vfmdb %v0, %v1, %v0 +; Z15-NEXT: vrepg %v1, %v0, 1 +; Z15-NEXT: vfmdb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0d killed $f0d killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + %arrayidx14 = getelementptr inbounds double, ptr %x, i64 16 + %8 = load double, ptr %arrayidx14, align 8 + %mul15 = fmul reassoc nsz arcp contract afn double %mul13, %8 + %arrayidx16 = getelementptr inbounds double, ptr %x, i64 18 + %9 = load double, ptr %arrayidx16, align 8 + %mul17 = fmul reassoc nsz arcp contract afn double %mul15, %9 + %arrayidx18 = getelementptr inbounds double, ptr %x, i64 20 + %10 = load double, ptr %arrayidx18, align 8 + %mul19 = fmul reassoc nsz arcp contract afn double %mul17, %10 + %arrayidx20 = getelementptr inbounds double, ptr %x, i64 22 + %11 = load double, ptr %arrayidx20, align 8 + %mul21 = fmul reassoc nsz arcp contract afn double %mul19, %11 + %arrayidx22 = getelementptr inbounds double, ptr %x, i64 24 + %12 = load double, ptr %arrayidx22, align 8 + %mul23 = fmul reassoc nsz arcp contract afn double %mul21, %12 + %arrayidx24 = getelementptr inbounds double, ptr %x, i64 26 + %13 = load double, ptr %arrayidx24, align 8 + %mul25 = fmul reassoc nsz arcp contract afn double %mul23, %13 + %arrayidx26 = getelementptr inbounds double, ptr %x, i64 28 + %14 = load double, ptr %arrayidx26, align 8 + %mul27 = fmul reassoc nsz arcp contract afn double %mul25, %14 + %arrayidx28 = getelementptr inbounds double, ptr %x, i64 30 + %15 = load double, ptr %arrayidx28, align 8 + %mul29 = fmul reassoc nsz arcp contract afn double %mul27, %15 + ret double %mul29 +} + +define float 
@fmul_float_4_factors_seq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_4_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 12(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_4_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + ret float %mul5 +} + +define float @fmul_float_6_factors_seq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_6_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 12(%r2) +; Z13-NEXT: meeb %f0, 20(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_6_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: meeb %f0, 20(%r2) +; Z15-NEXT: meeb %f0, 24(%r2) +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 5 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 6 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + ret float %mul9 +} + +define float @fmul_float_8_factors_seq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_8_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 12(%r2) +; Z13-NEXT: meeb %f0, 20(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: meeb %f0, 28(%r2) +; Z13-NEXT: meeb %f0, 32(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_8_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vl %v1, 20(%r2) +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load 
float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 5 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 6 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 7 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 8 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + ret float %mul13 +} + +define float @fmul_float_16_factors_seq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_16_factors_seq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 4(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 12(%r2) +; Z13-NEXT: meeb %f0, 20(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: meeb %f0, 28(%r2) +; Z13-NEXT: meeb %f0, 32(%r2) +; Z13-NEXT: meeb %f0, 36(%r2) +; Z13-NEXT: meeb %f0, 40(%r2) +; Z13-NEXT: meeb %f0, 44(%r2) +; Z13-NEXT: meeb %f0, 48(%r2) +; Z13-NEXT: meeb %f0, 52(%r2) +; Z13-NEXT: meeb %f0, 56(%r2) +; Z13-NEXT: meeb %f0, 60(%r2) +; Z13-NEXT: meeb %f0, 64(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_16_factors_seq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r2) +; Z15-NEXT: vl %v1, 36(%r2) +; Z15-NEXT: vl %v2, 20(%r2) +; Z15-NEXT: vl %v3, 52(%r2) +; Z15-NEXT: vfmsb %v2, %v2, %v3 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vfmsb %v0, %v0, %v2 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 5 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 6 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 7 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 8 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + %arrayidx14 
= getelementptr inbounds float, ptr %x, i64 9 + %8 = load float, ptr %arrayidx14, align 4 + %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 10 + %9 = load float, ptr %arrayidx16, align 4 + %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 11 + %10 = load float, ptr %arrayidx18, align 4 + %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 12 + %11 = load float, ptr %arrayidx20, align 4 + %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 13 + %12 = load float, ptr %arrayidx22, align 4 + %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 14 + %13 = load float, ptr %arrayidx24, align 4 + %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 15 + %14 = load float, ptr %arrayidx26, align 4 + %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 16 + %15 = load float, ptr %arrayidx28, align 4 + %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15 + ret float %mul29 +} + +define float @fmul_float_4_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_4_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 16(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_4_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 0(%r2) +; Z15-NEXT: vlef %v0, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 24(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + ret float %mul5 +} + +define float @fmul_float_6_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_6_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 16(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: meeb %f0, 32(%r2) +; Z13-NEXT: meeb %f0, 40(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_6_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 0(%r2) +; Z15-NEXT: vlef %v0, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 24(%r2), 3 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: meeb %f0, 32(%r2) +; Z15-NEXT: meeb %f0, 40(%r2) +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr 
inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + ret float %mul9 +} + +define float @fmul_float_8_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_8_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 16(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: meeb %f0, 32(%r2) +; Z13-NEXT: meeb %f0, 40(%r2) +; Z13-NEXT: meeb %f0, 48(%r2) +; Z13-NEXT: meeb %f0, 56(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_8_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 32(%r2) +; Z15-NEXT: vlef %v0, 40(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 8(%r2), 1 +; Z15-NEXT: vlef %v0, 48(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v0, 56(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vfmsb %v0, %v1, %v0 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + ret float %mul13 +} + +define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_float_16_factors_nonseq: +; Z13: # %bb.0: # %entry +; Z13-NEXT: lde %f0, 8(%r2) +; Z13-NEXT: meeb %f0, 0(%r2) +; Z13-NEXT: meeb %f0, 16(%r2) +; Z13-NEXT: meeb %f0, 24(%r2) +; Z13-NEXT: meeb %f0, 32(%r2) +; Z13-NEXT: meeb %f0, 40(%r2) +; Z13-NEXT: meeb %f0, 48(%r2) +; Z13-NEXT: meeb %f0, 56(%r2) +; Z13-NEXT: meeb %f0, 64(%r2) +; Z13-NEXT: meeb %f0, 72(%r2) +; Z13-NEXT: meeb %f0, 80(%r2) +; Z13-NEXT: meeb 
%f0, 88(%r2) +; Z13-NEXT: meeb %f0, 96(%r2) +; Z13-NEXT: meeb %f0, 104(%r2) +; Z13-NEXT: meeb %f0, 112(%r2) +; Z13-NEXT: meeb %f0, 120(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_float_16_factors_nonseq: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vlrepf %v0, 64(%r2) +; Z15-NEXT: vlef %v0, 72(%r2), 1 +; Z15-NEXT: vlrepf %v1, 0(%r2) +; Z15-NEXT: vlef %v1, 8(%r2), 1 +; Z15-NEXT: vlrepf %v2, 96(%r2) +; Z15-NEXT: vlef %v2, 104(%r2), 1 +; Z15-NEXT: vlrepf %v3, 32(%r2) +; Z15-NEXT: vlef %v3, 40(%r2), 1 +; Z15-NEXT: vlef %v0, 80(%r2), 2 +; Z15-NEXT: vlef %v1, 16(%r2), 2 +; Z15-NEXT: vlef %v2, 112(%r2), 2 +; Z15-NEXT: vlef %v3, 48(%r2), 2 +; Z15-NEXT: vlef %v0, 88(%r2), 3 +; Z15-NEXT: vlef %v1, 24(%r2), 3 +; Z15-NEXT: vfmsb %v0, %v1, %v0 +; Z15-NEXT: vlef %v2, 120(%r2), 3 +; Z15-NEXT: vlef %v3, 56(%r2), 3 +; Z15-NEXT: vfmsb %v2, %v3, %v2 +; Z15-NEXT: vfmsb %v0, %v0, %v2 +; Z15-NEXT: vmrlg %v1, %v0, %v0 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: vrepf %v1, %v0, 1 +; Z15-NEXT: vfmsb %v0, %v0, %v1 +; Z15-NEXT: # kill: def $f0s killed $f0s killed $v0 +; Z15-NEXT: br %r14 +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, 
align 4 + %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15 + ret float %mul29 +} + +define void @fmul_fp128_8_factors(ptr noalias nocapture writeonly sret(fp128) align 8 %agg.result, ptr nocapture noundef readonly %x) { +; Z13-LABEL: fmul_fp128_8_factors: +; Z13: # %bb.0: # %entry +; Z13-NEXT: ld %f0, 0(%r3) +; Z13-NEXT: ld %f2, 8(%r3) +; Z13-NEXT: ld %f1, 16(%r3) +; Z13-NEXT: ld %f3, 24(%r3) +; Z13-NEXT: mxbr %f1, %f0 +; Z13-NEXT: ld %f0, 32(%r3) +; Z13-NEXT: ld %f2, 40(%r3) +; Z13-NEXT: mxbr %f0, %f1 +; Z13-NEXT: ld %f1, 48(%r3) +; Z13-NEXT: ld %f3, 56(%r3) +; Z13-NEXT: mxbr %f1, %f0 +; Z13-NEXT: ld %f0, 80(%r3) +; Z13-NEXT: ld %f2, 88(%r3) +; Z13-NEXT: mxbr %f0, %f1 +; Z13-NEXT: ld %f1, 96(%r3) +; Z13-NEXT: ld %f3, 104(%r3) +; Z13-NEXT: mxbr %f1, %f0 +; Z13-NEXT: ld %f0, 112(%r3) +; Z13-NEXT: ld %f2, 120(%r3) +; Z13-NEXT: mxbr %f0, %f1 +; Z13-NEXT: ld %f1, 128(%r3) +; Z13-NEXT: ld %f3, 136(%r3) +; Z13-NEXT: mxbr %f1, %f0 +; Z13-NEXT: std %f1, 0(%r2) +; Z13-NEXT: std %f3, 8(%r2) +; Z13-NEXT: br %r14 +; +; Z15-LABEL: fmul_fp128_8_factors: +; Z15: # %bb.0: # %entry +; Z15-NEXT: vl %v0, 0(%r3), 3 +; Z15-NEXT: vl %v1, 16(%r3), 3 +; Z15-NEXT: vl %v2, 32(%r3), 3 +; Z15-NEXT: vl %v3, 48(%r3), 3 +; Z15-NEXT: vl %v4, 80(%r3), 3 +; Z15-NEXT: vl %v5, 96(%r3), 3 +; Z15-NEXT: vl %v6, 112(%r3), 3 +; Z15-NEXT: vl %v7, 128(%r3), 3 +; Z15-NEXT: wfmxb %v2, %v2, %v6 +; Z15-NEXT: wfmxb %v0, %v0, %v4 +; Z15-NEXT: wfmxb %v3, %v3, %v7 +; Z15-NEXT: wfmxb %v1, %v1, %v5 +; Z15-NEXT: wfmxb %v1, %v1, %v3 +; Z15-NEXT: wfmxb %v0, %v0, %v2 +; Z15-NEXT: wfmxb %v0, %v0, %v1 +; Z15-NEXT: vst %v0, 0(%r2), 3 +; Z15-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn fp128 %1, %0 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 5 + %4 = load fp128, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 6 + %5 = load fp128, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 7 + %6 = load fp128, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 8 + %7 = load fp128, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7 + store fp128 %mul13, ptr %agg.result, align 8 + ret void +}