Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -509,11 +509,18 @@
                   LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Generates an ordered vector reduction using extracts to reduce the value.
+Value *
+getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op,
+                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+                        RecurrenceDescriptor::MRK_Invalid,
+                    ArrayRef<Value *> RedOps = None);
+
 /// Generates a vector reduction using shufflevectors to reduce the value.
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
                                RecurrenceDescriptor::MRK_Invalid,
-                           ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+                           ArrayRef<Value *> RedOps = None);
 
 /// Create a target reduction of the given vector. The reduction operation
 /// is described by the \p Opcode parameter. min/max reductions require
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- lib/CodeGen/ExpandReductions.cpp
+++ lib/CodeGen/ExpandReductions.cpp
@@ -78,13 +78,15 @@
 bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
   bool Changed = false;
-  SmallVector<IntrinsicInst*, 4> Worklist;
+  SmallVector<IntrinsicInst *, 4> Worklist;
   for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
     if (auto II = dyn_cast<IntrinsicInst>(&*I))
       Worklist.push_back(II);
 
   for (auto *II : Worklist) {
     IRBuilder<> Builder(II);
+    bool IsOrdered = false;
+    Value *Acc = nullptr;
     Value *Vec = nullptr;
     auto ID = II->getIntrinsicID();
     auto MRK = RecurrenceDescriptor::MRK_Invalid;
     switch (ID) {
@@ -92,11 +94,10 @@
     case Intrinsic::experimental_vector_reduce_fadd:
     case Intrinsic::experimental_vector_reduce_fmul:
       // FMFs must be attached to the call, otherwise it's an ordered reduction
-      // and it can't be handled by generating this shuffle sequence.
-      // TODO: Implement scalarization of ordered reductions here for targets
-      // without native support.
+      // and it can't be handled by generating a shuffle sequence.
       if (!II->getFastMathFlags().isFast())
-        continue;
+        IsOrdered = true;
+      Acc = II->getArgOperand(0);
       Vec = II->getArgOperand(1);
       break;
     case Intrinsic::experimental_vector_reduce_add:
@@ -118,7 +119,9 @@
     }
     if (!TTI->shouldExpandReduction(II))
       continue;
-    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    Value *Rdx =
+        IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
+                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1526,6 +1526,38 @@
   return V;
 }
 
+// Helper to generate an ordered reduction.
+Value *
+llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+                          unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
+  unsigned VF = Src->getType()->getVectorNumElements();
+
+  // Extract and apply reduction ops in ascending order:
+  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1]
+  Value *Result = Acc;
+  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+    Value *Ext =
+        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+                                   "bin.rdx");
+    } else {
+      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+             "Invalid min/max");
+      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result,
+                                                    Ext);
+    }
+
+    if (!RedOps.empty())
+      propagateIRFlags(Result, RedOps);
+  }
+
+  return Result;
+}
+
 // Helper to generate a log2 shuffle reduction.
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
Index: test/CodeGen/Generic/expand-experimental-reductions.ll
===================================================================
--- test/CodeGen/Generic/expand-experimental-reductions.ll
+++ test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -102,14 +102,39 @@
 define float @fadd_f32_strict(<4 x float> %vec) {
 ; CHECK-LABEL: @fadd_f32_strict(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
-; CHECK-NEXT:    ret float [[R]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
 ;
 entry:
   %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
   ret float %r
 }
 
+define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32_strict_accum(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
+;
+entry:
+  %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec)
+  ret float %r
+}
+
 define float @fmul_f32(<4 x float> %vec) {
 ; CHECK-LABEL: @fmul_f32(
 ; CHECK-NEXT:  entry:
@@ -125,6 +150,42 @@
   ret float %r
 }
 
+define float @fmul_f32_strict(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32_strict(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
+;
+entry:
+  %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32_strict_accum(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1
+; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3
+; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[BIN_RDX3]]
+;
+entry:
+  %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)
+  ret float %r
+}
+
 define i64 @smax_i64(<2 x i64> %vec) {
 ; CHECK-LABEL: @smax_i64(
 ; CHECK-NEXT:  entry: