Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -965,6 +965,29 @@
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
 
+  /// \returns True if the target wants to handle the given reduction idiom in
+  /// scalarized shuffle form instead of vectorized shuffle form.
+  /// E.g.
+  ///
+  /// Scalarized shuffle form:
+  /// %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
+  ///                           <i32 2, i32 3, i32 undef, i32 undef>
+  /// %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf
+  /// %0 = extractelement <4 x float> %bin.rdx, i32 0
+  /// %1 = extractelement <4 x float> %bin.rdx, i32 1
+  /// %res = fadd fast float %0, %1 // scalar operation follows.
+  ///
+  /// Vectorized shuffle form:
+  /// %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef,
+  ///               <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  /// %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf
+  /// %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+  ///                <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  /// %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1 // vector operation
+  ///                                                        // follows.
+  /// %res = extractelement <4 x float> %bin.rdx2, i32 0
+  bool useScalarizedShuffleReduction() const;
+
   /// \returns True if the target wants to expand the given reduction intrinsic
   /// into a shuffle sequence.
   bool shouldExpandReduction(const IntrinsicInst *II) const;
@@ -1166,6 +1189,7 @@
                                         VectorType *VecTy) const = 0;
   virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
+  virtual bool useScalarizedShuffleReduction() const = 0;
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
   virtual int getInstructionLatency(const Instruction *I) = 0;
 };
@@ -1561,6 +1585,9 @@
                              ReductionFlags Flags) const override {
     return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
   }
+  bool useScalarizedShuffleReduction() const override {
+    return Impl.useScalarizedShuffleReduction();
+  }
   bool shouldExpandReduction(const IntrinsicInst *II) const override {
     return Impl.shouldExpandReduction(II);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -564,6 +564,10 @@
     return false;
   }
 
+  bool useScalarizedShuffleReduction() const {
+    return false;
+  }
+
   bool shouldExpandReduction(const IntrinsicInst *II) const {
     return true;
   }
Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -517,10 +517,15 @@
                     ArrayRef<Value *> RedOps = None);
 
 /// Generates a vector reduction using shufflevectors to reduce the value.
+/// If \p ScalarizationFollows is set, shuffling stops once two elements
+/// remain and those are extracted and combined by a single scalar operation,
+/// instead of finishing with a vector operation and extracting element 0.
+/// A one-element \p Src always takes the latter path.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
                                MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
-                           ArrayRef<Value *> RedOps = None);
+                           ArrayRef<Value *> RedOps = None,
+                           bool ScalarizationFollows = false);
 
 /// Create a target reduction of the given vector. The reduction operation
 /// is described by the \p Opcode parameter. min/max reductions require
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -622,6 +622,10 @@
   return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
 }
 
+bool TargetTransformInfo::useScalarizedShuffleReduction() const {
+  return TTIImpl->useScalarizedShuffleReduction();
+}
+
 bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
   return TTIImpl->shouldExpandReduction(II);
 }
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1558,11 +1558,33 @@
   return Result;
 }
 
+// Helper to generate one reduction step combining Lhs and Rhs: a binary op,
+// or a min/max of kind MinMaxKind when Op is ICmp/FCmp.
+static Value *
+createReductionOp(IRBuilder<> &Builder, Value *Lhs, Value *Rhs, unsigned Op,
+                  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                  ArrayRef<Value *> RedOps) {
+  Value *Res;
+  if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+    // Floating point operations had to be 'fast' to enable the reduction.
+    Res = addFastMathFlag(
+        Builder.CreateBinOp((Instruction::BinaryOps)Op, Lhs, Rhs, "bin.rdx"));
+  } else {
+    assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+           "Invalid min/max");
+    Res = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Lhs, Rhs);
+  }
+  if (!RedOps.empty())
+    propagateIRFlags(Res, RedOps);
+
+  return Res;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                           RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
-                          ArrayRef<Value *> RedOps) {
+                          ArrayRef<Value *> RedOps,
+                          bool ScalarizationFollows) {
   unsigned VF = Src->getType()->getVectorNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
@@ -1571,7 +1593,9 @@
          "Reduction emission only supported for pow2 vectors!");
   Value *TmpVec = Src;
   SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
-  for (unsigned i = VF; i != 1; i >>= 1) {
+  // A scalar final step stops at two lanes; a 1-lane vector has no lane 1.
+  unsigned UB = (ScalarizationFollows && VF > 1) ? 2 : 1;
+  for (unsigned i = VF; i != UB; i >>= 1) {
     // Move the upper half of the vector to the lower half.
     for (unsigned j = 0; j != i / 2; ++j)
       ShuffleMask[j] = Builder.getInt32(i / 2 + j);
@@ -1583,22 +1607,20 @@
     Value *Shuf = Builder.CreateShuffleVector(
         TmpVec, UndefValue::get(TmpVec->getType()),
         ConstantVector::get(ShuffleMask), "rdx.shuf");
-
-    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
-      // Floating point operations had to be 'fast' to enable the reduction.
-      TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                                   TmpVec, Shuf, "bin.rdx"));
-    } else {
-      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
-             "Invalid min/max");
-      TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
-                                                    Shuf);
-    }
-    if (!RedOps.empty())
-      propagateIRFlags(TmpVec, RedOps);
+    TmpVec = createReductionOp(Builder, TmpVec, Shuf, Op, MinMaxKind, RedOps);
   }
-  // The result is in the first element of the vector.
-  return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+  if (UB == 1)
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+  // The result comes from performing the scalar operation on the first two
+  // elements of the vector.
+  return createReductionOp(
+      Builder,
+      Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)),
+      Builder.CreateExtractElement(TmpVec, Builder.getInt32(1)),
+      Op, MinMaxKind, RedOps);
 }
 
 /// Create a simple vector reduction specified by an opcode and some
@@ -1675,7 +1697,9 @@
   }
   if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
     return BuildFunc();
-  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+
+  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps,
+                             TTI->useScalarizedShuffleReduction());
 }
 
 /// Create a vector reduction using a given recurrence descriptor.