Diff 512859

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,142 Lines • ▼ Show 20 Lines	public:
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for		/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.		/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
void buildTree(ArrayRef<Value *> Roots,		void buildTree(ArrayRef<Value *> Roots,
const SmallDenseSet<Value *> &UserIgnoreLst);		const SmallDenseSet<Value *> &UserIgnoreLst);

/// Construct a vectorizable tree that starts at \p Roots.		/// Construct a vectorizable tree that starts at \p Roots.
void buildTree(ArrayRef<Value *> Roots);		void buildTree(ArrayRef<Value *> Roots);

/// Checks if the very first tree node is going to be vectorized.
bool isVectorizedFirstNode() const {
return !VectorizableTree.empty() &&
VectorizableTree.front()->State == TreeEntry::Vectorize;
}

/// Returns the main instruction for the very first node.
Instruction *getFirstNodeMainOp() const {
assert(!VectorizableTree.empty() && "No tree to get the first node from");
return VectorizableTree.front()->getMainOp();
}

/// Returns whether the root node has in-tree uses.		/// Returns whether the root node has in-tree uses.
bool doesRootHaveInTreeUses() const {		bool doesRootHaveInTreeUses() const {
return !VectorizableTree.empty() &&		return !VectorizableTree.empty() &&
!VectorizableTree.front()->UserTreeIndices.empty();		!VectorizableTree.front()->UserTreeIndices.empty();
}		}

/// Return the scalars of the root node.		/// Return the scalars of the root node.
ArrayRef<Value *> getRootNodeScalars() const {		ArrayRef<Value *> getRootNodeScalars() const {
▲ Show 20 Lines • Show All 12,164 Lines • ▼ Show 20 Lines	for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
}		}
V.buildExternalUses(LocalExternallyUsedValues);		V.buildExternalUses(LocalExternallyUsedValues);

V.computeMinimumValueSizes();		V.computeMinimumValueSizes();

// Estimate cost.		// Estimate cost.
InstructionCost TreeCost = V.getTreeCost(VL);		InstructionCost TreeCost = V.getTreeCost(VL);
InstructionCost ReductionCost =		InstructionCost ReductionCost =
getReductionCost(TTI, VL, ReduxWidth, RdxFMF);		getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
if (V.isVectorizedFirstNode() && isa<LoadInst>(VL.front())) {
Instruction *MainOp = V.getFirstNodeMainOp();
for (Value *V : VL) {
auto *VI = dyn_cast<LoadInst>(V);
// Add the costs of scalar GEP pointers, to be removed from the
// code.
if (!VI || VI == MainOp)
continue;
auto *Ptr = dyn_cast<GetElementPtrInst>(VI->getPointerOperand());
if (!Ptr || !Ptr->hasOneUse() || Ptr->hasAllConstantIndices())
continue;
TreeCost -= TTI->getArithmeticInstrCost(
Instruction::Add, Ptr->getType(), TTI::TCK_RecipThroughput);
}
}
InstructionCost Cost = TreeCost + ReductionCost;		InstructionCost Cost = TreeCost + ReductionCost;
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");		LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
if (!Cost.isValid())		if (!Cost.isValid())
return nullptr;		return nullptr;
if (Cost >= -SLPCostThreshold) {		if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {		V.getORE()->emit([&]() {
return OptimizationRemarkMissed(		return OptimizationRemarkMissed(
SV_NAME, "HorSLPNotBeneficial",		SV_NAME, "HorSLPNotBeneficial",
▲ Show 20 Lines • Show All 219 Lines • ▼ Show 20 Lines	#endif
}		}
return VectorizedTree;		return VectorizedTree;
}		}

private:		private:
/// Calculate the cost of a reduction.		/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,		InstructionCost getReductionCost(TargetTransformInfo *TTI,
ArrayRef<Value *> ReducedVals,		ArrayRef<Value *> ReducedVals,
unsigned ReduxWidth, FastMathFlags FMF) {		bool IsCmpSelMinMax, unsigned ReduxWidth,
		FastMathFlags FMF) {
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;		TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
Value *FirstReducedVal = ReducedVals.front();		Value *FirstReducedVal = ReducedVals.front();
Type *ScalarTy = FirstReducedVal->getType();		Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);		FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost = 0, ScalarCost;		InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since		// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at the compile time.		// the reduction value can be calculated at the compile time.
bool AllConsts = allConstant(ReducedVals);		bool AllConsts = allConstant(ReducedVals);
		auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
		InstructionCost Cost = 0;
		// Scalar cost is repeated for N-1 elements.
		int Cnt = ReducedVals.size();
		for (Value *RdxVal : ReducedVals) {
		if (Cnt == 1)
		break;
		--Cnt;
		if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
		Cost += GenCostFn();
		continue;
		}
		InstructionCost ScalarCost = 0;
		for (User *U : RdxVal->users()) {
		auto *RdxOp = cast<Instruction>(U);
		if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
		ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
		continue;
		}
		ScalarCost = InstructionCost::getInvalid();
		break;
		}
		if (ScalarCost.isValid())
		Cost += ScalarCost;
		else
		Cost += GenCostFn();
		}
		return Cost;
		};
switch (RdxKind) {		switch (RdxKind) {
case RecurKind::Add:		case RecurKind::Add:
case RecurKind::Mul:		case RecurKind::Mul:
case RecurKind::Or:		case RecurKind::Or:
case RecurKind::And:		case RecurKind::And:
case RecurKind::Xor:		case RecurKind::Xor:
case RecurKind::FAdd:		case RecurKind::FAdd:
case RecurKind::FMul: {		case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);		unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
if (!AllConsts)		if (!AllConsts)
VectorCost =		VectorCost =
TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);		TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);		ScalarCost = EvaluateScalarCost([&]() {
		return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
		});
break;		break;
}		}
case RecurKind::FMax:		case RecurKind::FMax:
case RecurKind::FMin: {		case RecurKind::FMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);		auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
if (!AllConsts) {		if (!AllConsts) {
auto *VecCondTy =		auto *VecCondTy =
cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));		cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost =		VectorCost =
TTI->getMinMaxReductionCost(VectorTy, VecCondTy,		TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
/*IsUnsigned=*/false, CostKind);		/*IsUnsigned=*/false, CostKind);
}		}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);		CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,		ScalarCost = EvaluateScalarCost([&]() {
SclCondTy, RdxPred, CostKind) +		return TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy, SclCondTy,
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,		RdxPred, CostKind) +
SclCondTy, RdxPred, CostKind);		TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
		RdxPred, CostKind);
		});
break;		break;
}		}
case RecurKind::SMax:		case RecurKind::SMax:
case RecurKind::SMin:		case RecurKind::SMin:
case RecurKind::UMax:		case RecurKind::UMax:
case RecurKind::UMin: {		case RecurKind::UMin: {
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);		auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
if (!AllConsts) {		if (!AllConsts) {
auto *VecCondTy =		auto *VecCondTy =
cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));		cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =		bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;		RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,		VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
IsUnsigned, CostKind);		IsUnsigned, CostKind);
}		}
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);		CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy,		ScalarCost = EvaluateScalarCost([&]() {
SclCondTy, RdxPred, CostKind) +		return TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy, SclCondTy,
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,		RdxPred, CostKind) +
SclCondTy, RdxPred, CostKind);		TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy, SclCondTy,
		RdxPred, CostKind);
		});
break;		break;
}		}
default:		default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");		llvm_unreachable("Expected arithmetic or min/max reduction operation");
}		}

// Scalar cost is repeated for N-1 elements.
ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost		LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal		<< " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");		<< " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;		return VectorCost - ScalarCost;
}		}

/// Emit a horizontal reduction of the vectorized value.		/// Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,		Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
▲ Show 20 Lines • Show All 1,122 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,SSE		; RUN: opt < %s -mtriple=x86_64-unknown-linux -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX		; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT,AVX2		; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH		; RUN: opt < %s -mtriple=x86_64-unknown-linux -mcpu=skx -passes=slp-vectorizer -S -slp-threshold=-100 | FileCheck %s --check-prefixes=CHECK,THRESH

@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16		@arr = local_unnamed_addr global [32 x i32] zeroinitializer, align 16
@arr1 = local_unnamed_addr global [32 x float] zeroinitializer, align 16		@arr1 = local_unnamed_addr global [32 x float] zeroinitializer, align 16
@arrp = local_unnamed_addr global [32 x ptr] zeroinitializer, align 16		@arrp = local_unnamed_addr global [32 x ptr] zeroinitializer, align 16
@var = global i32 zeroinitializer, align 8		@var = global i32 zeroinitializer, align 8

declare i32 @llvm.smax.i32(i32, i32)		declare i32 @llvm.smax.i32(i32, i32)
▲ Show 20 Lines • Show All 1,094 Lines • ▼ Show 20 Lines	;
%m76 = tail call i16 @llvm.smin.i16(i16 %t7, i16 %t6)		%m76 = tail call i16 @llvm.smin.i16(i16 %t7, i16 %t6)
%m3210 = tail call i16 @llvm.smin.i16(i16 %m32, i16 %m10)		%m3210 = tail call i16 @llvm.smin.i16(i16 %m32, i16 %m10)
%m7654 = tail call i16 @llvm.smin.i16(i16 %m76, i16 %m54)		%m7654 = tail call i16 @llvm.smin.i16(i16 %m76, i16 %m54)
%m = tail call i16 @llvm.smin.i16(i16 %m7654, i16 %m3210)		%m = tail call i16 @llvm.smin.i16(i16 %m7654, i16 %m3210)
ret i16 %m		ret i16 %m
}		}

define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {		define i64 @umax_intrinsic_rdx_v4i64(ptr %p0) {
; SSE-LABEL: @umax_intrinsic_rdx_v4i64(		; DEFAULT-LABEL: @umax_intrinsic_rdx_v4i64(
; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1		; DEFAULT-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2		; DEFAULT-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3		; DEFAULT-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
; SSE-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4		; DEFAULT-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4
; SSE-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4		; DEFAULT-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4
; SSE-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4		; DEFAULT-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4
; SSE-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4		; DEFAULT-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4
; SSE-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])		; DEFAULT-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
; SSE-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])		; DEFAULT-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
; SSE-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])		; DEFAULT-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
; SSE-NEXT: ret i64 [[M]]		; DEFAULT-NEXT: ret i64 [[M]]
		RKSimonUnsubmitted Not Done Reply Inline Actions Makes sense: https://gcc.godbolt.org/z/fKbGnzEr8 RKSimon: Makes sense: https://gcc.godbolt.org/z/fKbGnzEr8
;
; AVX-LABEL: @umax_intrinsic_rdx_v4i64(
; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i64, ptr [[P0:%.*]], i64 1
; AVX-NEXT: [[P2:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 2
; AVX-NEXT: [[P3:%.*]] = getelementptr inbounds i64, ptr [[P0]], i64 3
; AVX-NEXT: [[T0:%.*]] = load i64, ptr [[P0]], align 4
; AVX-NEXT: [[T1:%.*]] = load i64, ptr [[P1]], align 4
; AVX-NEXT: [[T2:%.*]] = load i64, ptr [[P2]], align 4
; AVX-NEXT: [[T3:%.*]] = load i64, ptr [[P3]], align 4
; AVX-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]])
; AVX-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]])
; AVX-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]])
; AVX-NEXT: ret i64 [[M]]
;
; AVX2-LABEL: @umax_intrinsic_rdx_v4i64(
; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
; AVX2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
; AVX2-NEXT: ret i64 [[TMP2]]
;		;
; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(		; THRESH-LABEL: @umax_intrinsic_rdx_v4i64(
; THRESH-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4		; THRESH-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[P0:%.*]], align 4
; THRESH-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])		; THRESH-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> [[TMP1]])
; THRESH-NEXT: ret i64 [[TMP2]]		; THRESH-NEXT: ret i64 [[TMP2]]
;		;
%p1 = getelementptr inbounds i64, ptr %p0, i64 1		%p1 = getelementptr inbounds i64, ptr %p0, i64 1
%p2 = getelementptr inbounds i64, ptr %p0, i64 2		%p2 = getelementptr inbounds i64, ptr %p0, i64 2
▲ Show 20 Lines • Show All 96 Lines • Show Last 20 Lines

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

Show All 15 Lines
;		;
%2 = load i32, ptr @arr, align 16		%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4		%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = call i32 @llvm.smax.i32(i32 %2, i32 %3)		%4 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
ret i32 %4		ret i32 %4
}		}

define i32 @smax_v4i32(i32) {		define i32 @smax_v4i32(i32) {
; CHECK-LABEL: @smax_v4i32(		; SSE-LABEL: @smax_v4i32(
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16		; SSE-NEXT: [[TMP2:%.*]] = load i32, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])		; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
; CHECK-NEXT: ret i32 [[TMP3]]		; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
		; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
		; SSE-NEXT: [[TMP6:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP2]], i32 [[TMP3]])
		; SSE-NEXT: [[TMP7:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP6]], i32 [[TMP4]])
		; SSE-NEXT: [[TMP8:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP7]], i32 [[TMP5]])
		; SSE-NEXT: ret i32 [[TMP8]]
		;
		; AVX-LABEL: @smax_v4i32(
		; AVX-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @arr, align 16
		; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP2]])
		; AVX-NEXT: ret i32 [[TMP3]]
		RKSimonUnsubmitted Not Done Reply Inline Actions This looks to be about right: https://gcc.godbolt.org/z/sq99696Y7 You can add additional SSE test levels if you want to be certain? RKSimon: This looks to be about right: https://gcc.godbolt.org/z/sq99696Y7 You can add additional SSE…
		ABataevAuthorUnsubmitted Done Reply Inline Actions You mean add some extra tests for smin/umin/umax/fmin/fmax? ABataev: You mean add some extra tests for smin/umin/umax/fmin/fmax?
		RKSimonUnsubmitted Not Done Reply Inline Actions No - extra SSE test levels - I've added them at rG162284b2e1a970a01144d1d8e7f8d4fd1e03c5bf RKSimon: No - extra SSE test levels - I've added them at rG162284b2e1a970a01144d1d8e7f8d4fd1e03c5bf
;		;
%2 = load i32, ptr @arr, align 16		%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4		%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8		%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
%5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4		%5 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 3), align 4
%6 = call i32 @llvm.smax.i32(i32 %2, i32 %3)		%6 = call i32 @llvm.smax.i32(i32 %2, i32 %3)
%7 = call i32 @llvm.smax.i32(i32 %6, i32 %4)		%7 = call i32 @llvm.smax.i32(i32 %6, i32 %4)
%8 = call i32 @llvm.smax.i32(i32 %7, i32 %5)		%8 = call i32 @llvm.smax.i32(i32 %7, i32 %5)
Show All 19 Lines	;
%12 = call i32 @llvm.smax.i32(i32 %11, i32 %5)		%12 = call i32 @llvm.smax.i32(i32 %11, i32 %5)
%13 = call i32 @llvm.smax.i32(i32 %12, i32 %6)		%13 = call i32 @llvm.smax.i32(i32 %12, i32 %6)
%14 = call i32 @llvm.smax.i32(i32 %13, i32 %7)		%14 = call i32 @llvm.smax.i32(i32 %13, i32 %7)
%15 = call i32 @llvm.smax.i32(i32 %14, i32 %8)		%15 = call i32 @llvm.smax.i32(i32 %14, i32 %8)
%16 = call i32 @llvm.smax.i32(i32 %15, i32 %9)		%16 = call i32 @llvm.smax.i32(i32 %15, i32 %9)
ret i32 %16		ret i32 %16
}		}

define i32 @smax_v16i32(i32) {		define i32 @smax_v16i32(i32) {
		RKSimonUnsubmitted Not Done Reply Inline Actions CMP+CMOV is quick even on ancient x86 - the smax.i32 throughput cost of 1 is realistic. The issue is the predicted smax.v16i32 reduction cost, which is currently 33 (based on expansion of costs in getMinMaxReductionCost), but realistically is closer to 12 cycles (based off some quick llvm-mca tests) RKSimon: CMP+CMOV is quick even on ancient x86 - the smax.i32 throughput cost of 1 is realistic. The…
		ABataevAuthorUnsubmitted Done Reply Inline Actions Can you fix it? ABataev: Can you fix it?
		RKSimonUnsubmitted Not Done Reply Inline Actions I'll try to fix some of the obvious issues to unstick this patch, but a more complete fix will take more time. RKSimon: I'll try to fix some of the obvious issues to unstick this patch, but a more complete fix will…
		RKSimonUnsubmitted Not Done Reply Inline Actions Please can you rebase after rG63c3895327839ba5b57f5b99ec9e888abf976ac6 ? RKSimon: Please can you rebase after rG63c3895327839ba5b57f5b99ec9e888abf976ac6 ?
; CHECK-LABEL: @smax_v16i32(		; CHECK-LABEL: @smax_v16i32(
; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16		; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @arr, align 16
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])		; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> [[TMP2]])
; CHECK-NEXT: ret i32 [[TMP3]]		; CHECK-NEXT: ret i32 [[TMP3]]
;		;
%2 = load i32, ptr @arr, align 16		%2 = load i32, ptr @arr, align 16
%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4		%3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4
%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8		%4 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8
Show All 22 Lines	;
%27 = call i32 @llvm.smax.i32(i32 %26, i32 %12)		%27 = call i32 @llvm.smax.i32(i32 %26, i32 %12)
%28 = call i32 @llvm.smax.i32(i32 %27, i32 %13)		%28 = call i32 @llvm.smax.i32(i32 %27, i32 %13)
%29 = call i32 @llvm.smax.i32(i32 %28, i32 %14)		%29 = call i32 @llvm.smax.i32(i32 %28, i32 %14)
%30 = call i32 @llvm.smax.i32(i32 %29, i32 %15)		%30 = call i32 @llvm.smax.i32(i32 %29, i32 %15)
%31 = call i32 @llvm.smax.i32(i32 %30, i32 %16)		%31 = call i32 @llvm.smax.i32(i32 %30, i32 %16)
%32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)		%32 = call i32 @llvm.smax.i32(i32 %31, i32 %17)
ret i32 %32		ret i32 %32
}		}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
; SSE: {{.*}}

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Improve reduction cost model for scalars.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 512859

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SLP]Improve reduction cost model for scalars.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 512859

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll

llvm/test/Transforms/SLPVectorizer/X86/horizontal-smax.ll

[SLP]Improve reduction cost model for scalars.
ClosedPublic