diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1326,11 +1326,6 @@ bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present. }; - /// \returns True if the target wants to handle the given reduction idiom in - /// the intrinsics form instead of the shuffle form. - bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - ReductionFlags Flags) const; - /// \returns True if the target prefers reductions in loop. bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const; @@ -1652,8 +1647,6 @@ virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; - virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - ReductionFlags) const = 0; virtual bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags) const = 0; virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, @@ -2183,10 +2176,6 @@ VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } - bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - ReductionFlags Flags) const override { - return Impl.useReductionIntrinsic(Opcode, Ty, Flags); - } bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const override { return Impl.preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -700,11 +700,6 @@ return VF; } - bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const { - return false; - } - bool preferInLoopReduction(unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const { return false; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1050,11 +1050,6 @@ return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } -bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty, - ReductionFlags Flags) const { - return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); -} - bool TargetTransformInfo::preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const { return TTIImpl->preferInLoopReduction(Opcode, Ty, Flags); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -266,9 +266,6 @@ bool supportsScalableVectors() const { return ST->hasSVE(); } - bool useReductionIntrinsic(unsigned Opcode, Type *Ty, - TTI::ReductionFlags Flags) const; - int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwiseForm, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1089,31 +1089,6 @@ return Considerable; } -bool 
-AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
-                                      TTI::ReductionFlags Flags) const {
-  auto *VTy = cast<VectorType>(Ty);
-  unsigned ScalarBits = Ty->getScalarSizeInBits();
-  switch (Opcode) {
-  case Instruction::FAdd:
-  case Instruction::FMul:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor:
-  case Instruction::Mul:
-    return false;
-  case Instruction::Add:
-    return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
-  case Instruction::ICmp:
-    return (ScalarBits < 64) &&
-           (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
-  case Instruction::FCmp:
-    return Flags.NoNaN;
-  default:
-    llvm_unreachable("Unhandled reduction opcode");
-  }
-  return false;
-}
-
 int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                            bool IsPairwise, bool IsUnsigned,
                                            TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -186,9 +186,6 @@
   int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                      VectorType *SubTp);

-  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
-                             TTI::ReductionFlags Flags) const;
-
   bool preferInLoopReduction(unsigned Opcode, Type *Ty,
                              TTI::ReductionFlags Flags) const;

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2091,11 +2091,6 @@
   BaseT::getPeelingPreferences(L, SE, PP);
 }

-bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
-                                       TTI::ReductionFlags Flags) const {
-  return ST->hasMVEIntegerOps();
-}
-
 bool ARMTTIImpl::preferInLoopReduction(unsigned Opcode, Type *Ty,
                                        TTI::ReductionFlags Flags) const {
   if (!ST->hasMVEIntegerOps())
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -231,14 +231,6 @@
                       bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();

-  /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
-  /// into shuffles and vector math/logic by the backend
-  /// (see TTI::shouldExpandReduction)
-  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
-                             TTI::ReductionFlags Flags) const {
-    return true;
-  }
-
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
                       Align Alignment, unsigned AddressSpace);
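With the last in-tree override gone, createSimpleTargetReduction() in the LoopUtils.cpp hunk below emits a llvm.vector.reduce.* intrinsic unconditionally; a target that still wants the shuffle expansion recovers it at codegen time through TTI::shouldExpandReduction() and the ExpandReductions pass, as the deleted X86 comment above notes. A minimal sketch of the only form the vectorizer now produces (the function and value names are illustrative, not taken from this patch):

; The vectorizer always emits the intrinsic; whether it is later
; expanded back into a shuffle sequence is a per-target codegen choice.
define i32 @sum_v4i32(<4 x i32> %v) {
  %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  ret i32 %r
}
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

The test updates that follow are largely mechanical consequences of this one behavioral change.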
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -54,11 +54,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;

-static cl::opt<bool> ForceReductionIntrinsic(
-    "force-reduction-intrinsics", cl::Hidden,
-    cl::desc("Force creating reduction intrinsics for testing."),
-    cl::init(false));
-
 #define DEBUG_TYPE "loop-utils"

 static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
@@ -1025,14 +1020,10 @@
                                          const TargetTransformInfo *TTI,
                                          Value *Src, RecurKind RdxKind,
                                          ArrayRef<Value *> RedOps) {
-  unsigned Opcode = RecurrenceDescriptor::getOpcode(RdxKind);
   TargetTransformInfo::ReductionFlags RdxFlags;
   RdxFlags.IsMaxOp = RdxKind == RecurKind::SMax || RdxKind == RecurKind::UMax ||
                      RdxKind == RecurKind::FMax;
   RdxFlags.IsSigned = RdxKind == RecurKind::SMax || RdxKind == RecurKind::SMin;
-  if (!ForceReductionIntrinsic &&
-      !TTI->useReductionIntrinsic(Opcode, Src->getType(), RdxFlags))
-    return getShuffleReduction(Builder, Src, Opcode, RdxKind, RedOps);

   auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
   switch (RdxKind) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr33053.ll
@@ -5,10 +5,9 @@
 @b = common local_unnamed_addr global i32 0, align 4
 @a = common local_unnamed_addr global i16* null, align 8

-; Function Attrs: norecurse nounwind readonly
 define i32 @fn1() local_unnamed_addr #0 {
-; Ensure that we don't emit reduction intrinsics for unsupported short reductions.
-; CHECK-NOT: @llvm.vector.reduce
+; We expect the backend to expand all reductions.
+; CHECK: @llvm.vector.reduce entry: %0 = load i32, i32* @b, align 4, !tbaa !1 %cmp40 = icmp sgt i32 %0, 0 diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll @@ -62,9 +62,7 @@ ; GFX9-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]] ; GFX9-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]] ; GFX9-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]] -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> -; GFX9-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]] -; GFX9-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0 +; GFX9-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]]) ; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; GFX9: scalar.ph: ; GFX9-NEXT: br label [[FOR_BODY:%.*]] @@ -132,9 +130,7 @@ ; VI-NEXT: [[BIN_RDX18:%.*]] = fadd fast <2 x half> [[TMP21]], [[BIN_RDX17]] ; VI-NEXT: [[BIN_RDX19:%.*]] = fadd fast <2 x half> [[TMP22]], [[BIN_RDX18]] ; VI-NEXT: [[BIN_RDX20:%.*]] = fadd fast <2 x half> [[TMP23]], [[BIN_RDX19]] -; VI-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x half> [[BIN_RDX20]], <2 x half> poison, <2 x i32> -; VI-NEXT: [[BIN_RDX21:%.*]] = fadd fast <2 x half> [[BIN_RDX20]], [[RDX_SHUF]] -; VI-NEXT: [[TMP25:%.*]] = extractelement <2 x half> [[BIN_RDX21]], i32 0 +; VI-NEXT: [[TMP25:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX20]]) ; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VI: scalar.ph: ; VI-NEXT: br label [[FOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/sphinx.ll @@ -67,9 +67,7 @@ ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP16]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTEREND]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-call.ll @@ -22,9 +22,7 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <2 x double> [ [[TMP5]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[DOTLCSSA]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[DOTLCSSA]], [[RDX_SHUF]] -; CHECK-NEXT: 
[[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[DOTLCSSA]]) ; CHECK-NEXT: ret double [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll --- a/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/widened-massv-vfabi-attr.ll @@ -20,9 +20,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP5]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP5]]) ; CHECK-NEXT: ret double [[TMP7]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -14,8 +14,7 @@ ; CHECK: add i64 %index, 2, !dbg ![[LOC]] ; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]] ; CHECK: middle.block -; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[BR_LOC:[0-9]+]] -; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[BR_LOC]] +; CHECK: call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %{{.*}}), !dbg ![[BR_LOC:[0-9]+]] ; CHECK: for.body ; CHECK: br i1{{.*}}, label %for.body,{{.*}}, !dbg ![[BR_LOC]], ; CHECK: ![[BR_LOC]] = !DILocation(line: 5, diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -143,7 +143,7 @@ ; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> ; Check also that the casts were not moved needlessly. ; CHECK: sitofp <4 x i16> [[L1]] to <4 x double> -; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double> +; CHECK: sitofp <4 x i16> [[SHUF]] to <4 x double> ; CHECK: middle.block: ; CHECK: %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3 ; CHECK: scalar.ph: @@ -357,8 +357,8 @@ } ; We vectorize this first order recurrence, by generating two -; extracts for the phi `val.phi` - one at the last index and -; another at the second last index. We need these 2 extracts because +; extracts for the phi `val.phi` - one at the last index and +; another at the second last index. We need these 2 extracts because ; the first order recurrence phi is used outside the loop, so we require the phi ; itself and not its update (addx). 
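(An aside on the two predicated-reduction hunks further down in this file: the tail-folding select no longer lives in the vector body; it sinks into middle.block, where it feeds the reduction intrinsic directly. Roughly, with hypothetical value names:

; Sketch of the new middle block: mask the final partial sums, then
; collapse them with one intrinsic call instead of a shuffle ladder.
middle.block:
  %masked = select <4 x i1> %mask, <4 x i32> %partial, <4 x i32> %vec.phi
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %masked)

Moving the select out of the loop also renumbers the unnamed values, which is why these hunks touch the surrounding TMP lines.)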
; UNROLL-NO-IC-LABEL: extract_second_last_iteration @@ -705,16 +705,12 @@ ; CHECK-NEXT: [[TMP21]] = phi <4 x i32> [ [[TMP16]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP20]], [[PRED_UDIV_IF8]] ] ; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP21]], <4 x i32> ; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP24]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF10:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF10]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[BIN_RDX11]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]] +; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]]) ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[BB2:%.*]] @@ -834,17 +830,13 @@ ; CHECK-NEXT: store i32 [[TMP4]], i32* [[TMP38]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE16]] ; CHECK: pred.store.continue16: -; CHECK-NEXT: [[TMP39:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], -; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP39]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP39]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF17:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX18:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF17]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i32> [[BIN_RDX18]], i32 0 +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]] +; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[BB2:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/fix-reduction-dbg.ll b/llvm/test/Transforms/LoopVectorize/fix-reduction-dbg.ll --- a/llvm/test/Transforms/LoopVectorize/fix-reduction-dbg.ll +++ b/llvm/test/Transforms/LoopVectorize/fix-reduction-dbg.ll @@ -7,11 +7,7 @@ ; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL:[0-9]+]] ; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]] ; 
CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]] -; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]] -; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]] -; CHECK-NEXT: %{{.*}}= shufflevector <4 x i32>{{.*}}, !dbg ![[DL]] -; CHECK-NEXT: %{{.*}}= add <4 x i32>{{.*}}, !dbg ![[DL]] -; CHECK-NEXT: %{{.*}}= extractelement <4 x i32>{{.*}}, !dbg ![[DL]] +; CHECK-NEXT: %{{.*}}= call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{.*}}), !dbg ![[DL]] ; CHECK-NEXT: %{{.*}}= icmp eq i64{{.*}}, !dbg ![[DL]] ; CHECK-NEXT: br i1 %{{.*}}, !dbg ![[DL]] ; CHECK: ![[DL]] = !DILocation(line: 5, diff --git a/llvm/test/Transforms/LoopVectorize/flags.ll b/llvm/test/Transforms/LoopVectorize/flags.ll --- a/llvm/test/Transforms/LoopVectorize/flags.ll +++ b/llvm/test/Transforms/LoopVectorize/flags.ll @@ -56,8 +56,7 @@ ; CHECK: load <4 x float> ; CHECK: fadd fast <4 x float> ; CHECK: br -; CHECK: fadd fast <4 x float> -; CHECK: fadd fast <4 x float> +; CHECK: call fast float @llvm.vector.reduce.fadd.v4f32 define float @fast_math(float* noalias %s) { entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll --- a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll +++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll @@ -68,13 +68,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP5]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536 ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -387,10 +387,10 @@ } ; Double pattern: -; Check that is not vectorized if fp-instruction has no fast-math property. +; Check that is not vectorized if fp-instruction has no fast-math property. ; ; double fcmp_0_fsub_select2_notvectorize(double * restrict x, const int N) { -; double sum = 0. +; double sum = 0. ; for (int i = 0; i < N; ++i) ; if (x[i] > 0.) ; sum -= x[i]; @@ -468,7 +468,7 @@ } ; Float pattern: -; Check that is not vectorized if fp-instruction has no fast-math property. +; Check that is not vectorized if fp-instruction has no fast-math property. ; ; float fcmp_0_fmult_select1_notvectorize(float * restrict x, const int N) { ; float sum = 0. 
@@ -793,9 +793,10 @@ ; return sum; ; } -; CHECK-LABEL: @fcmp_store_back( -; CHECK-NOT: <4 x float> define float @fcmp_store_back(float* nocapture %a, i32 %LEN) nounwind readonly { +; CHECK-LABEL: @fcmp_store_back( +; CHECK-NOT: <4 x float> +; entry: %cmp7 = icmp sgt i32 %LEN, 0 br i1 %cmp7, label %for.body.preheader, label %for.end @@ -819,3 +820,6 @@ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ] ret float %sum.0.lcssa } + +; Make sure any check-not directives are not triggered by function declarations. +; CHECK: declare diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -504,7 +504,7 @@ ; CHECK: br i1 true, label %scalar.ph, label %vector.ph ; CHECK: middle.block: -; CHECK: %[[v9:.+]] = extractelement <2 x i32> %bin.rdx, i32 0 +; CHECK: %[[v9:.+]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> ; CHECK: scalar.ph: ; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %[[v0:.+]] ] ; CHECK: %bc.merge.rdx = phi i32 [ 1, %[[v0:.+]] ], [ %[[v9]], %middle.block ] diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -214,11 +214,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF4]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -865,16 +861,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[TMP3]], [[RDX_SHUF5]] -; CHECK-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <4 x i32> [[BIN_RDX6]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[BIN_RDX6]], [[RDX_SHUF7]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[BIN_RDX8]], i32 0 -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP4]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 
x float> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -1061,11 +1049,7 @@ ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP17]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP17]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP17]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1259,11 +1243,7 @@ ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP30:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP20]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP20]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[BIN_RDX4]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP20]]) ; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/invariant-store-vectorization.ll @@ -27,7 +27,7 @@ ; CHECK-NEXT: br i1 ; CHECK-LABEL: middle.block: -; CHECK: %rdx.shuf = shufflevector <4 x i32> +; CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) { entry: %ntrunc = trunc i64 %n to i32 @@ -364,7 +364,8 @@ ; variant value stored to uniform address tests that the code gen extracts the ; last element from the variant vector and scalar stores it into the uniform ; address. 
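A reference point for all of these middle.block diffs: the removed shuffle ladder and the new intrinsic call compute the same scalar. A sketch for a <4 x i32> add reduction, with the old expansion shown as comments (the masks are the usual log2 halving masks; value names are illustrative):

; Old expansion:
;   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
;   %a1 = add <4 x i32> %v, %s1
;   %s2 = shufflevector <4 x i32> %a1, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
;   %a2 = add <4 x i32> %a1, %s2
;   %r  = extractelement <4 x i32> %a2, i32 0
; New form, a single call:
%r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)

The fadd/fmul reduction intrinsics additionally take an explicit start value, which is why the floating-point checks read e.g. @llvm.vector.reduce.fadd(float -0.000000e+00, ...): -0.0 (0xH8000 for half) is the identity for fadd.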
-; CHECK-LABEL: variant_val_store_to_inv_address +define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @variant_val_store_to_inv_address( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N:%.*]], 1 ; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 @@ -389,20 +390,16 @@ ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 8, !alias.scope !36 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3 -; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4, !alias.scope !39, !noalias !36 ; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP41:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi <4 x i32> [ [[TMP5]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[DOTLCSSA]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[DOTLCSSA]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[BIN_RDX6:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[DOTLCSSA]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -418,11 +415,14 @@ ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]], [[LOOP42:!llvm.loop !.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[TMP3_LCSSA:%.*]] = phi i32 [ [[TMP3]], [[FOR_BODY]] ] ; CHECK-NEXT: br label [[FOR_END]] -define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK: for.end: +; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[TMP3_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; entry: %ntrunc = trunc i64 %n to i32 %cmp = icmp eq i32 %ntrunc, %k @@ -591,3 +591,6 @@ bb26: ret void } + +; Make sure any check-not directives are not triggered by function declarations. 
+; CHECK: declare diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1092,9 +1092,7 @@ ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP5]], [[RDX_SHUF]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 201, 200 ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll --- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll @@ -16,8 +16,7 @@ ; CHECK: icmp sgt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp sgt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.smax.v2i32 define i32 @max_red(i32 %max) { entry: @@ -45,8 +44,7 @@ ; CHECK: icmp slt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp sgt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.smax.v2i32 define i32 @max_red_inverse_select(i32 %max) { entry: @@ -73,8 +71,7 @@ ; CHECK: icmp slt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp slt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.smin.v2i32 define i32 @min_red(i32 %max) { entry: @@ -102,8 +99,7 @@ ; CHECK: icmp sgt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp slt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.smin.v2i32 define i32 @min_red_inverse_select(i32 %max) { entry: @@ -132,8 +128,7 @@ ; CHECK: icmp ugt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ugt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umax.v2i32 define i32 @umax_red(i32 %max) { entry: @@ -161,8 +156,7 @@ ; CHECK: icmp ult <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ugt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umax.v2i32 define i32 @umax_red_inverse_select(i32 %max) { entry: @@ -189,8 +183,7 @@ ; CHECK: icmp ult <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ult <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umin.v2i32 define i32 @umin_red(i32 %max) { entry: @@ -218,8 +211,7 @@ ; CHECK: icmp ugt <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ult <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umin.v2i32 define i32 @umin_red_inverse_select(i32 %max) { entry: @@ -247,8 +239,7 @@ ; CHECK: icmp sge <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp slt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.smin.v2i32 define i32 @sge_min_red(i32 %max) { entry: @@ -276,8 +267,7 @@ ; CHECK: icmp sle <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp sgt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 
@llvm.vector.reduce.smax.v2i32 define i32 @sle_min_red(i32 %max) { entry: @@ -305,8 +295,7 @@ ; CHECK: icmp uge <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ult <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umin.v2i32 define i32 @uge_min_red(i32 %max) { entry: @@ -334,8 +323,7 @@ ; CHECK: icmp ule <2 x i32> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: icmp ugt <2 x i32> -; CHECK: select <2 x i1> +; CHECK: call i32 @llvm.vector.reduce.umax.v2i32 define i32 @ule_min_red(i32 %max) { entry: @@ -415,8 +403,7 @@ ; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @max_red_float(float %max) #0 { entry: @@ -441,8 +428,7 @@ ; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @max_red_float_ge(float %max) #0 { entry: @@ -467,8 +453,7 @@ ; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @inverted_max_red_float(float %max) #0 { entry: @@ -493,8 +478,7 @@ ; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @inverted_max_red_float_le(float %max) #0 { entry: @@ -519,8 +503,7 @@ ; CHECK: fcmp fast ugt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @unordered_max_red_float(float %max) #0 { entry: @@ -545,8 +528,7 @@ ; CHECK: fcmp fast uge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @unordered_max_red_float_ge(float %max) #0 { entry: @@ -571,8 +553,7 @@ ; CHECK: fcmp fast ult <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @inverted_unordered_max_red_float(float %max) #0 { entry: @@ -597,8 +578,7 @@ ; CHECK: fcmp fast ule <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast ogt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmax.v2f32 define float @inverted_unordered_max_red_float_le(float %max) #0 { entry: @@ -626,8 +606,7 @@ ; CHECK: fcmp fast olt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @min_red_float(float %min) #0 { entry: @@ -652,8 +631,7 @@ ; CHECK: fcmp fast ole <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @min_red_float_le(float %min) #0 { entry: @@ -678,8 +656,7 @@ ; CHECK: fcmp fast ogt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast 
olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @inverted_min_red_float(float %min) #0 { entry: @@ -704,8 +681,7 @@ ; CHECK: fcmp fast oge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @inverted_min_red_float_ge(float %min) #0 { entry: @@ -730,8 +706,7 @@ ; CHECK: fcmp fast ult <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @unordered_min_red_float(float %min) #0 { entry: @@ -756,8 +731,7 @@ ; CHECK: fcmp fast ule <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @unordered_min_red_float_le(float %min) #0 { entry: @@ -782,8 +756,7 @@ ; CHECK: fcmp fast ugt <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @inverted_unordered_min_red_float(float %min) #0 { entry: @@ -808,8 +781,7 @@ ; CHECK: fcmp fast uge <2 x float> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x float> -; CHECK: select fast <2 x i1> +; CHECK: call fast float @llvm.vector.reduce.fmin.v2f32 define float @inverted_unordered_min_red_float_ge(float %min) #0 { entry: @@ -835,8 +807,7 @@ ; CHECK: fcmp fast olt <2 x double> ; CHECK: select <2 x i1> ; CHECK: middle.block -; CHECK: fcmp fast olt <2 x double> -; CHECK: select fast <2 x i1> +; CHECK: call fast double @llvm.vector.reduce.fmin.v2f64 define double @min_red_double(double %min) #0 { entry: @@ -881,5 +852,7 @@ ret float %max.red.0 } +; Make sure any check-not directives are not triggered by function declarations. 
+; CHECK: declare

 attributes #0 = { "no-nans-fp-math"="true" }
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
--- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-predicated-reduction-select -dce -instcombine -S | FileCheck %s
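These RUN-line changes drop -force-reduction-intrinsics: the flag, deleted from LoopUtils.cpp above, has no remaining effect now that the intrinsic form is emitted unconditionally. Note that -prefer-inloop-reductions still changes where the reduction happens; a rough sketch of the in-loop shape these tests check, with hypothetical value names:

; In-loop form: reduce each vector iteration to a scalar and fold it
; into a scalar accumulator phi inside the loop.
vector.body:
  %acc = phi i32 [ 0, %vector.ph ], [ %acc.next, %vector.body ]
  %part = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %acc.next = add i32 %part, %acc

 target datalayout = 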
"e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopVectorize/reduction.ll b/llvm/test/Transforms/LoopVectorize/reduction.ll --- a/llvm/test/Transforms/LoopVectorize/reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction.ll @@ -6,11 +6,7 @@ ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -41,11 +37,7 @@ ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: mul <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: mul <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: mul <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -76,11 +68,7 @@ ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: mul nsw <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -109,11 +97,7 @@ ;CHECK-LABEL: @reduction_mul( ;CHECK: mul <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: mul <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: mul <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp { %1 = icmp sgt i32 %n, 0 @@ -143,11 +127,7 @@ ;CHECK-LABEL: @start_at_non_zero( ;CHECK: phi <4 x i32> ;CHECK: -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp { entry: @@ -176,11 +156,7 @@ ;CHECK-LABEL: @reduction_and( ;CHECK: ;CHECK: and <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: and <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: and <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: @@ -208,11 +184,7 @@ ;CHECK-LABEL: @reduction_or( ;CHECK: or <4 x 
i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: or <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: or <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: @@ -240,11 +212,7 @@ ;CHECK-LABEL: @reduction_xor( ;CHECK: xor <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: xor <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: xor <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> ;CHECK: ret i32 define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { entry: @@ -498,11 +466,7 @@ ;CHECK: phi <4 x i32> ;CHECK: load <4 x i32> ;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <4 x i32> -;CHECK: add <4 x i32> -;CHECK: extractelement <4 x i32> %{{.*}}, i32 0 +;CHECK: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> ;CHECK: %sum.copy = phi i32 [ %[[SCALAR:.*]], %.lr.ph ], [ %[[VECTOR:.*]], %middle.block ] ;CHECK: ret i32 define i32 @reduction_sum_multiuse(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) { @@ -577,3 +541,6 @@ store i32 %.0.lcssa, i32* %c10, align 4 ret void } + +; Make sure any check-not directives are not triggered by function declarations. +; CHECK: declare diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction.ll b/llvm/test/Transforms/LoopVectorize/select-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/select-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction.ll @@ -41,13 +41,7 @@ ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP6:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF5]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT7:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP6]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT7]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll @@ -21,18 +21,12 @@ ; GFX9-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP2]], [[TMP3]] ; GFX9-NEXT: [[SELECT1:%.*]] = 
select i1 [[CMP1]], i32 [[TMP2]], i32 [[TMP3]] ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) ; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]] +; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP5]], i32 [[SELECT1]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STORE_SELECT]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA4]] +; GFX9-NEXT: ret i32 [[OP_EXTRA1]] ; %load1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 0), align 16 %load2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 1), align 4 @@ -68,18 +62,12 @@ ; GFX9-NEXT: [[CMP1:%.*]] = icmp slt i64 [[TMP2]], [[TMP3]] ; GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i64 [[TMP2]], i64 [[TMP3]] ; GFX9-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 2) to <4 x i64>*), align 16 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i64> [[TMP4]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i64> [[TMP4]], <4 x i64> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i64> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i64> [[RDX_MINMAX_SELECT]], <4 x i64> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> [[TMP4]]) ; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp slt i64 [[TMP5]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]] +; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i64 [[TMP5]], i64 [[SELECT1]] ; GFX9-NEXT: [[STORE_SELECT:%.*]] = select i1 [[CMP1]], i64 3, i64 4 ; GFX9-NEXT: store i64 [[STORE_SELECT]], i64* @var64, align 8 -; GFX9-NEXT: ret i64 [[OP_EXTRA4]] +; GFX9-NEXT: ret i64 [[OP_EXTRA1]] ; %load1 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 0), align 16 %load2 = load i64, i64* getelementptr inbounds ([32 x i64], [32 x i64]* @arr64, i64 0, i64 1), align 8 @@ -217,18 +205,12 @@ ; GFX9-NEXT: [[EX0:%.*]] = extractelement <2 x i32> [[VLOAD]], i32 0 ; 
GFX9-NEXT: [[SELECT1:%.*]] = select i1 [[CMP1]], i32 [[EX0]], i32 [[V1]] ; GFX9-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr, i64 0, i64 2) to <4 x i32>*), align 8 -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i32> [[TMP2]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP2]], <4 x i32> [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> poison, <4 x i32> -; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; GFX9-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]]) ; GFX9-NEXT: [[OP_EXTRA:%.*]] = icmp sgt i32 [[TMP3]], [[SELECT1]] -; GFX9-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]] +; GFX9-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[OP_EXTRA]], i32 [[TMP3]], i32 [[SELECT1]] ; GFX9-NEXT: [[STOREVAL:%.*]] = select i1 [[CMP1]], i32 3, i32 4 ; GFX9-NEXT: store i32 [[STOREVAL]], i32* @var, align 8 -; GFX9-NEXT: ret i32 [[OP_EXTRA4]] +; GFX9-NEXT: ret i32 [[OP_EXTRA1]] ; %vload = load <2 x i32>, <2 x i32>* bitcast ([32 x i32]* @arr to <2 x i32>*), align 16 %elt1 = extractelement <2 x i32> %vload, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll @@ -5,11 +5,7 @@ define half @reduction_half4(<4 x half> %a) { ; GFX9-LABEL: @reduction_half4( ; GFX9-NEXT: entry: -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <4 x i32> -; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> poison, <4 x i32> -; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]] -; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0 +; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]]) ; GFX9-NEXT: ret half [[TMP0]] ; ; VI-LABEL: @reduction_half4( @@ -39,13 +35,7 @@ define half @reduction_half8(<8 x half> %vec8) { ; GFX9-LABEL: @reduction_half8( ; GFX9-NEXT: entry: -; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> poison, <8 x i32> -; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]] -; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> poison, <8 x i32> -; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]] -; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> poison, <8 x i32> -; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]] -; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0 +; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]]) ; GFX9-NEXT: ret half [[TMP0]] ; ; VI-LABEL: @reduction_half8( @@ -91,15 +81,7 @@ define half @reduction_half16(<16 x 
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/reduction.ll
@@ -5,11 +5,7 @@
 define half @reduction_half4(<4 x half> %a) {
 ; GFX9-LABEL: @reduction_half4(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x half> [[A]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x half> [[BIN_RDX]], <4 x half> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x half> [[BIN_RDX2]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> [[A:%.*]])
 ; GFX9-NEXT: ret half [[TMP0]]
 ;
 ; VI-LABEL: @reduction_half4(
@@ -39,13 +35,7 @@
 define half @reduction_half8(<8 x half> %vec8) {
 ; GFX9-LABEL: @reduction_half8(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x half> [[VEC8:%.*]], <8 x half> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x half> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x half> [[BIN_RDX]], <8 x half> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <8 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x half> [[BIN_RDX2]], <8 x half> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <8 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x half> [[BIN_RDX4]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[VEC8:%.*]])
 ; GFX9-NEXT: ret half [[TMP0]]
 ;
 ; VI-LABEL: @reduction_half8(
@@ -91,15 +81,7 @@
 define half @reduction_half16(<16 x half> %vec16) {
 ; GFX9-LABEL: @reduction_half16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x half> [[VEC16:%.*]], <16 x half> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <16 x half> [[VEC16]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x half> [[BIN_RDX]], <16 x half> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = fadd fast <16 x half> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <16 x half> [[BIN_RDX2]], <16 x half> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = fadd fast <16 x half> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x half> [[BIN_RDX4]], <16 x half> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX6:%.*]] = fadd fast <16 x half> [[BIN_RDX4]], [[RDX_SHUF5]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x half> [[BIN_RDX6]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[VEC16:%.*]])
 ; GFX9-NEXT: ret half [[TMP0]]
 ;
 ; VI-LABEL: @reduction_half16(
@@ -203,11 +185,7 @@
 define i16 @reduction_v4i16(<4 x i16> %a) {
 ; GFX9-LABEL: @reduction_v4i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = add <4 x i16> [[A]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[BIN_RDX]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <4 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[BIN_RDX2]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[A:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_v4i16(
@@ -237,13 +215,7 @@
 define i16 @reduction_v8i16(<8 x i16> %vec8) {
 ; GFX9-LABEL: @reduction_v8i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX:%.*]] = add <8 x i16> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX2:%.*]] = add <8 x i16> [[BIN_RDX]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX2]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[BIN_RDX4:%.*]] = add <8 x i16> [[BIN_RDX2]], [[RDX_SHUF3]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[BIN_RDX4]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[VEC8:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_v8i16(
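In the @reduction_half4/8/16 hunks above, the new fadd intrinsic carries an explicit start value, 0xH8000 (-0.0 in half), which is the identity for floating-point addition; the fast flag is what lets the backend reassociate the lanes instead of summing them strictly in order. A minimal sketch of the same call outside a CHECK line:

  declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)

  ; Reassociable sum of four half lanes, seeded with the fadd identity -0.0.
  define half @sum_v4f16(<4 x half> %v) {
    %r = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH8000, <4 x half> %v)
    ret half %r
  }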
@@ -289,13 +261,7 @@
 define i16 @reduction_umin_v4i16(<4 x i16> %vec4) {
 ; GFX9-LABEL: @reduction_umin_v4i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> [[VEC4:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_umin_v4i16(
@@ -331,16 +297,7 @@
 define i16 @reduction_icmp_v8i16(<8 x i16> %vec8) {
 ; GFX9-LABEL: @reduction_icmp_v8i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[VEC8:%.*]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <8 x i16> [[VEC8]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP]], <8 x i16> [[VEC8]], <8 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP2]], <8 x i16> [[RDX_MINMAX_SELECT]], <8 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> poison, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp ult <8 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <8 x i1> [[RDX_MINMAX_CMP5]], <8 x i16> [[RDX_MINMAX_SELECT3]], <8 x i16> [[RDX_SHUF4]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <8 x i16> [[RDX_MINMAX_SELECT6]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[VEC8:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_icmp_v8i16(
@@ -402,19 +359,7 @@
 define i16 @reduction_smin_v16i16(<16 x i16> %vec16) {
 ; GFX9-LABEL: @reduction_smin_v16i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i16> [[VEC16:%.*]], <16 x i16> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <16 x i16> [[VEC16]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP]], <16 x i16> [[VEC16]], <16 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP2]], <16 x i16> [[RDX_MINMAX_SELECT]], <16 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> poison, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT3]], [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP5]], <16 x i16> [[RDX_MINMAX_SELECT3]], <16 x i16> [[RDX_SHUF4]]
-; GFX9-NEXT: [[RDX_SHUF7:%.*]] = shufflevector <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP8:%.*]] = icmp slt <16 x i16> [[RDX_MINMAX_SELECT6]], [[RDX_SHUF7]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT9:%.*]] = select <16 x i1> [[RDX_MINMAX_CMP8]], <16 x i16> [[RDX_MINMAX_SELECT6]], <16 x i16> [[RDX_SHUF7]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <16 x i16> [[RDX_MINMAX_SELECT9]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> [[VEC16:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_smin_v16i16(
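Note how the old form scales: the shuffle ladder needs log2(N) rounds (2 for v4, 3 for v8, 4 for v16 in the hunks above), while the intrinsic form stays a single call at every width. The intrinsic is overloaded on the vector type, and the mangled suffix in each check line just spells that type out. For illustration, the declarations behind the widths used above:

  ; One generic reduce operation; one declaration per concrete vector type.
  declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
  declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
  declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)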
@@ -530,13 +475,7 @@
 define i16 @reduction_umax_v4i16(<4 x i16> %vec4) {
 ; GFX9-LABEL: @reduction_umax_v4i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp ugt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> [[VEC4:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_umax_v4i16(
@@ -572,13 +511,7 @@
 define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
 ; GFX9-LABEL: @reduction_smax_v4i16(
 ; GFX9-NEXT: entry:
-; GFX9-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i16> [[VEC4:%.*]], <4 x i16> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <4 x i16> [[VEC4]], [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i16> [[VEC4]], <4 x i16> [[RDX_SHUF]]
-; GFX9-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; GFX9-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp sgt <4 x i16> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]]
-; GFX9-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i16> [[RDX_MINMAX_SELECT]], <4 x i16> [[RDX_SHUF1]]
-; GFX9-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[RDX_MINMAX_SELECT3]], i32 0
+; GFX9-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> [[VEC4:%.*]])
 ; GFX9-NEXT: ret i16 [[TMP0]]
 ;
 ; VI-LABEL: @reduction_smax_v4i16(
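For context on how these checks are exercised: the test file selects the two subtargets via RUN lines (GFX9 has packed 16-bit operations, VI does not, which is why the VI checks are untouched), and the scalar min/max chains in the test bodies are what the SLP vectorizer turns into the reductions checked above. A sketch of the shape of such a test, with the RUN flags and body assumed rather than copied from this patch:

  ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer < %s | FileCheck -check-prefix=GFX9 %s
  ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer < %s | FileCheck -check-prefix=VI %s

  ; A scalar smax chain over the four lanes; SLP recognizes this as a
  ; horizontal reduction and, on GFX9, emits the smax reduce intrinsic.
  define i16 @reduction_smax_v4i16(<4 x i16> %vec4) {
  entry:
    %elt0 = extractelement <4 x i16> %vec4, i64 0
    %elt1 = extractelement <4 x i16> %vec4, i64 1
    %elt2 = extractelement <4 x i16> %vec4, i64 2
    %elt3 = extractelement <4 x i16> %vec4, i64 3
    %cmp1 = icmp sgt i16 %elt1, %elt0
    %max1 = select i1 %cmp1, i16 %elt1, i16 %elt0
    %cmp2 = icmp sgt i16 %elt2, %max1
    %max2 = select i1 %cmp2, i16 %elt2, i16 %max1
    %cmp3 = icmp sgt i16 %elt3, %max2
    %max3 = select i1 %cmp3, i16 %elt3, i16 %max2
    ret i16 %max3
  }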