Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -753,6 +753,9 @@
   bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                              ReductionFlags Flags) const;
 
+  /// \returns True if the target wants to expand the given reduction intrinsic
+  /// into a shuffle sequence.
+  bool shouldExpandReduction(const IntrinsicInst *II) const;
   /// @}
 
 private:
@@ -910,6 +913,7 @@
                                    VectorType *VecTy) const = 0;
   virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
                                      ReductionFlags) const = 0;
+  virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
 };
 
 template <typename T>
@@ -1219,6 +1223,9 @@
                              ReductionFlags Flags) const override {
     return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
   }
+  bool shouldExpandReduction(const IntrinsicInst *II) const override {
+    return Impl.shouldExpandReduction(II);
+  }
 };
 
 template <typename T>
Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -462,6 +462,10 @@
     return false;
   }
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    return true;
+  }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
Index: llvm/trunk/include/llvm/CodeGen/ExpandReductions.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/ExpandReductions.h
+++ llvm/trunk/include/llvm/CodeGen/ExpandReductions.h
@@ -0,0 +1,24 @@
+//===----- ExpandReductions.h - Expand experimental reduction intrinsics --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXPANDREDUCTIONS_H
+#define LLVM_CODEGEN_EXPANDREDUCTIONS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class ExpandReductionsPass
+    : public PassInfoMixin<ExpandReductionsPass> {
+public:
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_EXPANDREDUCTIONS_H
Index: llvm/trunk/include/llvm/CodeGen/Passes.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/Passes.h
+++ llvm/trunk/include/llvm/CodeGen/Passes.h
@@ -405,6 +405,10 @@
   /// printing assembly.
   ModulePass *createMachineOutlinerPass();
 
+  /// This pass expands the experimental reduction intrinsics into sequences of
+  /// shuffles.
+  FunctionPass *createExpandReductionsPass();
+
 } // End llvm namespace
 
 /// Target machine pass initializer for passes with dependencies. Use with
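Note: the default implementation of the new hook (TargetTransformInfoImpl.h above) expands everything. As an illustration of how a backend would use it, a hypothetical target that can select the integer min/max reductions natively but wants everything else expanded might override it like this (sketch only, not part of this patch):

    bool shouldExpandReduction(const IntrinsicInst *II) const {
      switch (II->getIntrinsicID()) {
      case Intrinsic::experimental_vector_reduce_smax:
      case Intrinsic::experimental_vector_reduce_smin:
      case Intrinsic::experimental_vector_reduce_umax:
      case Intrinsic::experimental_vector_reduce_umin:
        return false; // Keep the intrinsic; the backend selects it directly.
      default:
        return true;  // Fall back to the shuffle expansion.
      }
    }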
Index: llvm/trunk/include/llvm/InitializePasses.h
===================================================================
--- llvm/trunk/include/llvm/InitializePasses.h
+++ llvm/trunk/include/llvm/InitializePasses.h
@@ -130,6 +130,7 @@
 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
+void initializeExpandReductionsPass(PassRegistry&);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
Index: llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
+++ llvm/trunk/include/llvm/Transforms/Utils/LoopUtils.h
@@ -491,6 +491,12 @@
                   LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Generates a vector reduction using shufflevectors to reduce the value.
+Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+                           RecurrenceDescriptor::MinMaxRecurrenceKind
+                               MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
+                           ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+
 /// Create a target reduction of the given vector. The reduction operation
 /// is described by the \p Opcode parameter. min/max reductions require
 /// additional information supplied in \p Flags.
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -505,6 +505,9 @@
   return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
 }
 
+bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
+  return TTIImpl->shouldExpandReduction(II);
+}
 
 TargetTransformInfo::Concept::~Concept() {}
Index: llvm/trunk/lib/CodeGen/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/CodeGen/CMakeLists.txt
+++ llvm/trunk/lib/CodeGen/CMakeLists.txt
@@ -23,6 +23,7 @@
   ExecutionDepsFix.cpp
   ExpandISelPseudos.cpp
   ExpandPostRAPseudos.cpp
+  ExpandReductions.cpp
   FaultMaps.cpp
   FEntryInserter.cpp
   FuncletLayout.cpp
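The LoopUtils.h change above turns getShuffleReduction into a public utility. A minimal usage sketch (assumed, not from this patch; V and InsertPt are placeholder names) for reducing a vector of floats with a fast fadd:

    IRBuilder<> Builder(InsertPt);
    FastMathFlags FMF;
    FMF.setUnsafeAlgebra();
    Builder.setFastMathFlags(FMF); // The emitted fadds get the 'fast' flag.
    Value *Scalar = llvm::getShuffleReduction(Builder, V, Instruction::FAdd);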
Index: llvm/trunk/lib/CodeGen/ExpandReductions.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/ExpandReductions.cpp
+++ llvm/trunk/lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,167 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+unsigned getOpcode(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::experimental_vector_reduce_fadd:
+    return Instruction::FAdd;
+  case Intrinsic::experimental_vector_reduce_fmul:
+    return Instruction::FMul;
+  case Intrinsic::experimental_vector_reduce_add:
+    return Instruction::Add;
+  case Intrinsic::experimental_vector_reduce_mul:
+    return Instruction::Mul;
+  case Intrinsic::experimental_vector_reduce_and:
+    return Instruction::And;
+  case Intrinsic::experimental_vector_reduce_or:
+    return Instruction::Or;
+  case Intrinsic::experimental_vector_reduce_xor:
+    return Instruction::Xor;
+  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::experimental_vector_reduce_umax:
+  case Intrinsic::experimental_vector_reduce_umin:
+    return Instruction::ICmp;
+  case Intrinsic::experimental_vector_reduce_fmax:
+  case Intrinsic::experimental_vector_reduce_fmin:
+    return Instruction::FCmp;
+  default:
+    llvm_unreachable("Unexpected ID");
+  }
+}
+
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::experimental_vector_reduce_smax:
+    return RecurrenceDescriptor::MRK_SIntMax;
+  case Intrinsic::experimental_vector_reduce_smin:
+    return RecurrenceDescriptor::MRK_SIntMin;
+  case Intrinsic::experimental_vector_reduce_umax:
+    return RecurrenceDescriptor::MRK_UIntMax;
+  case Intrinsic::experimental_vector_reduce_umin:
+    return RecurrenceDescriptor::MRK_UIntMin;
+  case Intrinsic::experimental_vector_reduce_fmax:
+    return RecurrenceDescriptor::MRK_FloatMax;
+  case Intrinsic::experimental_vector_reduce_fmin:
+    return RecurrenceDescriptor::MRK_FloatMin;
+  default:
+    return RecurrenceDescriptor::MRK_Invalid;
+  }
+}
+
+bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
+  bool Changed = false;
+  SmallVector<IntrinsicInst *, 4> Worklist;
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+    if (auto II = dyn_cast<IntrinsicInst>(&*I))
+      Worklist.push_back(II);
+
+  for (auto *II : Worklist) {
+    IRBuilder<> Builder(II);
+    Value *Vec = nullptr;
+    auto ID = II->getIntrinsicID();
+    auto MRK = RecurrenceDescriptor::MRK_Invalid;
+    switch (ID) {
+    case Intrinsic::experimental_vector_reduce_fadd:
+    case Intrinsic::experimental_vector_reduce_fmul:
+      // FMFs must be attached to the call, otherwise it's an ordered reduction
+      // and it can't be handled by generating this shuffle sequence.
+      // TODO: Implement scalarization of ordered reductions here for targets
+      // without native support.
+      if (!II->getFastMathFlags().unsafeAlgebra())
+        continue;
+      Vec = II->getArgOperand(1);
+      break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_fmax:
+    case Intrinsic::experimental_vector_reduce_fmin:
+      Vec = II->getArgOperand(0);
+      MRK = getMRK(ID);
+      break;
+    default:
+      continue;
+    }
+    if (!TTI->shouldExpandReduction(II))
+      continue;
+    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    II->replaceAllUsesWith(Rdx);
+    II->eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+  static char ID;
+  ExpandReductions() : FunctionPass(ID) {
+    initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return expandReductions(F, TTI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+                      "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+                    "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+  return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!expandReductions(F, &TTI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
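For the fadd/fmul cases above to fire, the reduction call must carry fast-math flags. A hedged sketch of emitting such a call (illustrative only; Vec and InsertPt are placeholder names, and the exact overload-type list for getDeclaration is an assumption based on the mangled names in the test below):

    Module *M = InsertPt->getModule();
    auto *VecTy = cast<VectorType>(Vec->getType());
    Function *RdxDecl = Intrinsic::getDeclaration(
        M, Intrinsic::experimental_vector_reduce_fadd,
        {VecTy->getElementType(), VecTy});
    IRBuilder<> Builder(InsertPt);
    Value *Acc = UndefValue::get(VecTy->getElementType()); // Start value; unused in the fast case.
    CallInst *Rdx = Builder.CreateCall(RdxDecl, {Acc, Vec});
    FastMathFlags FMF;
    FMF.setUnsafeAlgebra();
    Rdx->setFastMathFlags(FMF); // Without this, expandReductions skips the call.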
Index: llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
+++ llvm/trunk/lib/CodeGen/TargetPassConfig.cpp
@@ -487,6 +487,9 @@
 
   // Insert calls to mcount-like functions.
   addPass(createCountingFunctionInserterPass());
+
+  // Expand reduction intrinsics into shuffle sequences if the target wants to.
+  addPass(createExpandReductionsPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
Index: llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -137,6 +137,10 @@
   unsigned getMinPrefetchStride();
 
   unsigned getMaxPrefetchIterationsAhead();
+
+  bool shouldExpandReduction(const IntrinsicInst *II) const {
+    return false;
+  }
   /// @}
 };
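AArch64 opts out wholesale here, presumably so its backend can lower the intrinsics directly; every other in-tree target inherits the expanding default. To exercise the pass in isolation, a standalone driver could run it via the legacy pass manager (sketch; assumes M is a loaded Module and that the default TargetTransformInfo wrapper is acceptable):

    legacy::PassManager PM;
    PM.add(createExpandReductionsPass());
    PM.run(M);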
Index: llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
+++ llvm/trunk/lib/Transforms/Utils/LoopUtils.cpp
@@ -1125,11 +1125,10 @@
 }
 
 // Helper to generate a log2 shuffle reduction.
-static Value *
-getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
-                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
-                        RecurrenceDescriptor::MRK_Invalid,
-                    ArrayRef<Value *> RedOps = ArrayRef<Value *>()) {
+Value *
+llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
+                          ArrayRef<Value *> RedOps) {
   unsigned VF = Src->getType()->getVectorNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
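The helper halves the number of live lanes each round, so a power-of-two VF needs log2(VF) shuffle/op pairs plus one final extractelement. A conceptual scalar model of what the emitted IR computes for a 4-lane add (illustration only, not LLVM code):

    #include <array>
    int reduceAdd4(std::array<int, 4> V) {
      // Round 1: a shufflevector moves lanes {2,3} down; a vector add combines the halves.
      V[0] += V[2];
      V[1] += V[3];
      // Round 2: one more shuffle + add leaves the result in lane 0.
      V[0] += V[1];
      return V[0]; // Matches the final extractelement at index 0.
    }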
Index: llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll
===================================================================
--- llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll
+++ llvm/trunk/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -expand-reductions -S | FileCheck %s
+; Tests without a target which should expand all reductions
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+
+declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>)
+
+
+define i64 @add_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @add_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @mul_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @mul_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = mul <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @and_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @and_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = and <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @or_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @or_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @xor_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @xor_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = xor <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define float @fadd_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP0]]
+;
+entry:
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define float @fadd_f32_strict(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32_strict(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
+; CHECK-NEXT:    ret float [[R]]
+;
+entry:
+  %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define float @fmul_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP0]]
+;
+entry:
+  %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define i64 @smax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smax_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @smin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smin_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @umax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umax_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @umin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umin_i64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define double @fmax_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmax_f64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret double [[TMP0]]
+;
+entry:
+  %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec)
+  ret double %r
+}
+
+define double @fmin_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmin_f64(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT:    ret double [[TMP0]]
+;
+entry:
+  %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec)
+  ret double %r
+}
Index: llvm/trunk/test/CodeGen/X86/O0-pipeline.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/O0-pipeline.ll
+++ llvm/trunk/test/CodeGen/X86/O0-pipeline.ll
@@ -23,6 +23,7 @@
 ; CHECK-NEXT:     Shadow Stack GC Lowering
 ; CHECK-NEXT:     Remove unreachable blocks from the CFG
 ; CHECK-NEXT:     Inserts calls to mcount-like functions
+; CHECK-NEXT:     Expand reduction intrinsics
 ; CHECK-NEXT:   Rewrite Symbols
 ; CHECK-NEXT:   FunctionPass Manager
 ; CHECK-NEXT:     Dominator Tree Construction
Index: llvm/trunk/tools/llc/llc.cpp
===================================================================
--- llvm/trunk/tools/llc/llc.cpp
+++ llvm/trunk/tools/llc/llc.cpp
@@ -301,6 +301,7 @@
   initializeConstantHoistingLegacyPassPass(*Registry);
   initializeScalarOpts(*Registry);
   initializeVectorization(*Registry);
+  initializeExpandReductionsPass(*Registry);
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
Index: llvm/trunk/tools/opt/opt.cpp
===================================================================
--- llvm/trunk/tools/opt/opt.cpp
+++ llvm/trunk/tools/opt/opt.cpp
@@ -397,6 +397,7 @@
   initializeInterleavedAccessPass(Registry);
   initializeCountingFunctionInserterPass(Registry);
   initializeUnreachableBlockElimLegacyPassPass(Registry);
+  initializeExpandReductionsPass(Registry);
 #ifdef LINK_POLLY_INTO_TOOLS
   polly::initializePollyPasses(Registry);