Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -966,27 +966,29 @@ ReductionFlags Flags) const; /// \returns True if the target wants to handle the given reduction idiom in - /// scalarized shuffle form instead of vectorized shuffle form. + /// variable-length-vector shuffle form instead of fixed-length-vector + /// shuffle form (which gets generated by getShuffleReduction()). /// E.g. /// - /// Scalarized shuffle form: - /// %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> - /// - /// %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf - /// %0 = extractelement <4 x float> %bin.rdx, i32 0 - /// %1 = extractelement <4 x float> %bin.rdx, i32 1 - /// %res = fadd fast float %0, %1 // scalar operation follows. + /// Variable-length-vector shuffle form: + /// %rdx.shuf1 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> + /// + /// %rdx.shuf2 = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> + /// + /// %bin.rdx = fadd fast <2 x float> %rdx.shuf1, %rdx.shuf2 + /// %0 = extractelement <2 x float> %bin.rdx, i32 0 + /// %1 = extractelement <2 x float> %bin.rdx, i32 1 + /// %res = fadd fast float %0, %1 /// - /// Vectorized shuffle form: + /// Fixed-length-vector shuffle form: /// %rdx.shuf = shufflevector <4 x float> %a, <4 x float> undef, /// <4 x i32> /// %bin.rdx = fadd fast <4 x float> %a, %rdx.shuf /// %rdx.shuf1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, /// <4 x i32> - /// %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1 // vector operation - /// // follows. 
+ /// %bin.rdx2 = fadd fast <4 x float> %bin.rdx, %rdx.shuf1 /// %res = extractelement <4 x float> %bin.rdx2, i32 0 - bool useScalarizedShuffleReduction() const; + bool useVariableLengthShuffleReduction() const; /// \returns True if the target wants to expand the given reduction intrinsic /// into a shuffle sequence. @@ -1189,7 +1191,7 @@ VectorType *VecTy) const = 0; virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; - virtual bool useScalarizedShuffleReduction() const = 0; + virtual bool useVariableLengthShuffleReduction() const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual int getInstructionLatency(const Instruction *I) = 0; }; @@ -1585,8 +1587,8 @@ ReductionFlags Flags) const override { return Impl.useReductionIntrinsic(Opcode, Ty, Flags); } - bool useScalarizedShuffleReduction() const override { - return Impl.useScalarizedShuffleReduction(); + bool useVariableLengthShuffleReduction() const override { + return Impl.useVariableLengthShuffleReduction(); } bool shouldExpandReduction(const IntrinsicInst *II) const override { return Impl.shouldExpandReduction(II); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -564,7 +564,7 @@ return false; } - bool useScalarizedShuffleReduction() const { + bool useVariableLengthShuffleReduction() const { return false; } Index: include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- include/llvm/Transforms/Utils/LoopUtils.h +++ include/llvm/Transforms/Utils/LoopUtils.h @@ -517,14 +517,10 @@ ArrayRef RedOps = None); /// Generates a vector reduction using shufflevectors to reduce the value. 
-/// If \p ScalarizationFollows is set, getShuffleReduction() generates -/// scalar result instead of vector result (Shuffles are followed by a -/// scalar operation instead of a vector operation). Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = RecurrenceDescriptor::MRK_Invalid, - ArrayRef RedOps = None, - bool ScalarizationFollows = false); + ArrayRef RedOps = None); /// Create a target reduction of the given vector. The reduction operation /// is described by the \p Opcode parameter. min/max reductions require Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -622,8 +622,8 @@ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); } -bool TargetTransformInfo::useScalarizedShuffleReduction() const { - return TTIImpl->useScalarizedShuffleReduction(); +bool TargetTransformInfo::useVariableLengthShuffleReduction() const { + return TTIImpl->useVariableLengthShuffleReduction(); } bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { Index: lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- lib/Transforms/Utils/LoopUtils.cpp +++ lib/Transforms/Utils/LoopUtils.cpp @@ -26,6 +26,7 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -1581,10 +1582,44 @@ // Helper to generate a log2 shuffle reduction. 
Value * +getVariableLengthShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, + ArrayRef RedOps) { + unsigned VF = Src->getType()->getVectorNumElements(); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = Src; + + for (unsigned i = VF; i != 2; i >>= 1) { + // Extract the lower half. + Value *Shuf1 = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + createSequentialMask(Builder, 0, i/2, 0), "rdx.shuf1"); + + // Extract the upper half. + Value *Shuf2 = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + createSequentialMask(Builder, i / 2, i / 2, 0), "rdx.shuf2"); + TmpVec = createReductionOp(Builder, Shuf1, Shuf2, Op, MinMaxKind, RedOps); + } + + // The result comes from performing the scalar operation on the first two + // elements of the vector. + return createReductionOp( + Builder, + Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)), + Builder.CreateExtractElement(TmpVec, Builder.getInt32(1)), + Op, MinMaxKind, RedOps); +} + +// Helper to generate a log2 shuffle reduction. +Value * llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, - ArrayRef RedOps, - bool ScalarizedShufRed) { + ArrayRef RedOps) { unsigned VF = Src->getType()->getVectorNumElements(); // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles // and vector ops, reducing the set of values being computed by half each @@ -1593,9 +1628,7 @@ "Reduction emission only supported for pow2 vectors!"); Value *TmpVec = Src; SmallVector ShuffleMask(VF, nullptr); - unsigned UB = ScalarizedShufRed ? 
2 : 1; - - for (unsigned i = VF; i != UB; i >>= 1) { + for (unsigned i = VF; i != 1; i >>= 1) { // Move the upper half of the vector to the lower half. for (unsigned j = 0; j != i / 2; ++j) ShuffleMask[j] = Builder.getInt32(i / 2 + j); @@ -1610,17 +1643,8 @@ TmpVec = createReductionOp(Builder, TmpVec, Shuf, Op, MinMaxKind, RedOps); } - if (!ScalarizedShufRed) - // The result is in the first element of the vector. - return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - - // The result comes from performing the scalar operation on the first two - // elements of the vector. - return createReductionOp( - Builder, - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)), - Builder.CreateExtractElement(TmpVec, Builder.getInt32(1)), - Op, MinMaxKind, RedOps); + // The result is in the first element of the vector. + return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); } /// Create a simple vector reduction specified by an opcode and some @@ -1698,8 +1722,10 @@ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) return BuildFunc(); - return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps, - TTI->useScalarizedShuffleReduction()); + if (TTI->useVariableLengthShuffleReduction()) + return getVariableLengthShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); + + return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); } /// Create a vector reduction using a given recurrence descriptor.