llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

[... first 122 lines not shown ...]
static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));
vdmitrie: Would the following description
"Allow optimization of original scalar identity operations on matched horizontal reductions."
sound better?
Same question wrt the AllowSameScalarReduction variable name. Will AllowHorRdxIdenityOptimization fit better?
It probably makes sense to add a note (as a comment above or in the commit message): will the optimization run if we match a reduction but do not vectorize in the end? Based on the test updates it looks like it will. IMO it should be clearly stated that the pass outcome might not be vectorized code.
static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

static cl::opt<int>
    MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
                   cl::desc("Maximum depth of the lookup for consecutive stores."));
[... 10,937 lines not shown; enclosing scope: "auto &&CheckOperands = [this, IsCmpSelMinMax," ...]
          ExtraArgs.push_back(EdgeVal);
          continue;
        }
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(getRdxKind(EdgeInst), EdgeInst)) {

vdmitrie: nit: noticed by chance while refreshing my memory of the code. This call to getRdxKind can be safely replaced with RdxKind.
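The nit, spelled out (editor's sketch): once `getRdxKind(EdgeInst) != RdxKind` has evaluated to false, the two kinds are known to be equal, so the later call can simply reuse RdxKind.

        if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
            !isVectorizable(RdxKind, EdgeInst)) { // RdxKind == getRdxKind(EdgeInst) here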
          PossibleReducedVals.push_back(EdgeVal);
          continue;
        }
        ReductionOps.push_back(EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
[... 99 lines not shown; enclosing scope: "public:" ...]
  /// Attempt to vectorize the tree found by matchAssociativeReduction.
  Value *tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    constexpr int ReductionLimit = 4;
    constexpr unsigned RegMaxNumber = 4;
    constexpr unsigned RedValsMaxNumber = 128;
    // If there are a sufficient number of reduction values, reduce
    // to a nearby power-of-2. We can safely generate oversized
    // vectors and rely on the backend to split them to legal sizes.
    unsigned NumReducedVals = std::accumulate(
        ReducedVals.begin(), ReducedVals.end(), 0,
        [](int Num, ArrayRef<Value *> Vals) { return Num + Vals.size(); });

vdmitrie: nit: maybe make it unsigned too (+ deduction rule)?
    if (NumReducedVals < ReductionLimit &&
        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
          return RedV.size() < 2 ||
                 !all_of(RedV, [&](Value *V) { return isConstant(V); }) ||
                 !isSplat(RedV);
        }))
      return nullptr;

RKSimon: Use allConstant() ?
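RKSimon's suggestion, spelled out (editor's sketch; allConstant is this file's existing helper for "all values are constants"):

    if (NumReducedVals < ReductionLimit &&
        all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
          return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
        }))
      return nullptr;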
    IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
    // Track the reduced values in case they are replaced by extractelement
    // because of the vectorization.
    DenseMap<Value *, WeakTrackingVH> TrackedVals;
    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
[... 77 lines not shown; enclosing scope: "for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {" ...]
          SmallVector<int> Mask;
          if (isFixedVectorShuffle(CommonCandidates, Mask)) {
            ++I;
            Candidates.swap(CommonCandidates);
            ShuffledExtracts = true;
          }
        }
      }
      // Emit code for constant values.
      if (Candidates.size() > 1 && all_of(Candidates, isConstant)) {
        Value *Res = Candidates.front();

RKSimon: allConstant(Candidates)

        ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
        for (Value *V : makeArrayRef(Candidates).drop_front()) {
          Res = createOp(Builder, RdxKind, Res, V, "const.rdx", ReductionOps);
          ++VectorizedVals.try_emplace(V, 0).first->getSecond();
        }
        if (!VectorizedTree) {
          // Initialize the final value in the reduction.
          VectorizedTree = Res;
        } else {
          // Update the final value in the reduction.
          Builder.SetCurrentDebugLocation(
              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, Res,
                                    "op.rdx", ReductionOps);
        }
        continue;
      }

vdmitrie: This code pattern is seen multiple times. Any chance to outline it (for better readability)?
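The outlining vdmitrie asks for could look like the following (editor's sketch; the lambda name is hypothetical, the body is the initialize-or-update pattern repeated throughout tryToReduce):

      // Hypothetical helper: fold a newly reduced value into the running
      // reduction result, setting the debug location of the first reduction
      // op before emitting the combining operation.
      auto MergeIntoVectorizedTree = [&](Value *Res) {
        if (!VectorizedTree) {
          VectorizedTree = Res; // Initialize the final value.
          return;
        }
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
        VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, Res,
                                  "op.rdx", ReductionOps);
      };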
      unsigned NumReducedVals = Candidates.size();
      if (NumReducedVals < ReductionLimit &&
          (NumReducedVals < 2 || !isSplat(Candidates)))
        continue;
      // Gather same values.

vdmitrie: This comment line seems to be misplaced.

      MapVector<Value *, unsigned> SameValuesCounter;
      for (Value *V : Candidates)
        ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;

vdmitrie: We don't need to collect repeating counters if IsSupportedReusedRdxOp is false.
ABataev: No, we still need it; it is used to produce the vector of unique scalars, see line 12577.
      // Check if we have repeated values.
      bool IsSupportedReusedRdxOp = RdxKind != RecurKind::Mul &&
                                    RdxKind != RecurKind::FMul &&
                                    RdxKind != RecurKind::FMulAdd;

vdmitrie: This code shall form a member of the HorizontalReduction class, and emitReusedOps() needs to assert it can handle a case by calling the former before doing any processing.
      bool SameScaleFactor = false;

vdmitrie: This needs a comment. Are you trying to catch a case for special processing when each value is used the same number of times? Please describe what the benefit of catching such cases is.
ABataev: It may help to produce better vector ops. Say, we have 8 x aabbccdd. It will require a reduction of 8 elements and the built tree will still operate on 4 x element, but the last node will have a reshuffle from a 4 x vector to an 8 x vector + red(8 x). Instead we may have just red(4 x abcd)*2.
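ABataev's example in plain scalar form (editor's illustration in ordinary C++, not vector IR):

    // With operands {a,a,b,b,c,c,d,d} and an add reduction, every value
    // repeats twice, so reducing the 4 unique values and scaling by the
    // common repeat count matches the 8-wide reduction.
    int reduceWide(int a, int b, int c, int d) {
      return a + a + b + b + c + c + d + d; // red(8 x aabbccdd)
    }
    int reduceNarrow(int a, int b, int c, int d) {
      return (a + b + c + d) * 2;           // red(4 x abcd) * 2
    }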
      bool HasReusedScalars = SameValuesCounter.size() != Candidates.size();
      if (IsSupportedReusedRdxOp && HasReusedScalars) {
        SameScaleFactor =
            (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
             RdxKind == RecurKind::Xor) &&
            all_of(SameValuesCounter,
                   [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
                     return P.second == SameValuesCounter.front().second;
                   });

vdmitrie: The "IsSupportedReusedRdxOp && HasReusedScalars" pattern is the most common use. What about `OptReusedScalars = IsSupportedReusedRdxOp && SameValuesCounter.size() != NumReducedVals;` and just checking for OptReusedScalars everywhere across the code?
vdmitrie: drop_front() ?
vdmitrie: > drop_front() ?
This was a "nit" actually. drop_front() needs an ArrayRef, so it would have to look like the following: ... probably not worth the effort to save on just one comparison.
vdmitrie: Uhh, I did not realize takeVector clears the map. Finally, this one should do the trick: `all_of(drop_begin(SameValuesCounter), [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { return P.second == SameValuesCounter.begin()->second; })`
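As code, the consolidation vdmitrie floats (editor's sketch; the variable name is taken from the thread):

      // Compute the common predicate once and test it everywhere below.
      bool OptReusedScalars = IsSupportedReusedRdxOp &&
                              SameValuesCounter.size() != NumReducedVals;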
        Candidates.resize(SameValuesCounter.size());
        transform(SameValuesCounter, Candidates.begin(),
                  [](const auto &P) { return P.first; });
        NumReducedVals = Candidates.size();
        // Have a reduction of the same element.
        if (NumReducedVals == 1) {
          Value *RedVal = emitReusedOps(Candidates.front(), Builder, None,
                                        SameValuesCounter, TrackedToOrig);
          if (!VectorizedTree) {
            // Initialize the final value in the reduction.
            VectorizedTree = RedVal;
          } else {
            // Update the final value in the reduction.
            Builder.SetCurrentDebugLocation(
                cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
            VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal,
                                      "op.rdx", ReductionOps);
          }
          Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
          VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]);
          continue;
        }
      }
      unsigned MaxVecRegSize = V.getMaxVecRegSize();
      unsigned EltSize = V.getVectorElementSize(Candidates[0]);
      unsigned MaxElts = RegMaxNumber * PowerOf2Floor(MaxVecRegSize / EltSize);
      unsigned ReduxWidth = std::min<unsigned>(
          PowerOf2Floor(NumReducedVals), std::max(RedValsMaxNumber, MaxElts));
      unsigned Start = 0;
      unsigned Pos = Start;
[... 12 lines not shown; enclosing scope: "for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {" ...]
        }
        ++Pos;
        if (Pos < NumReducedVals - ReduxWidth + 1)
          return IsAnyRedOpGathered;
        Pos = Start;
        ReduxWidth /= 2;
        return IsAnyRedOpGathered;
      };
      bool AnyVectorized = false;
      while (Pos < NumReducedVals - ReduxWidth + 1 &&
             ReduxWidth >= ReductionLimit) {
        // Dependency in tree of the reduction ops - drop this attempt, try
        // later.
        if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
            Start == 0) {
          CheckForReusedReductionOps = true;
          break;
[... 36 lines not shown; enclosing scope: "for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {" ...]
          if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
            continue;
          for_each(ReducedVals[Cnt],
                   [&LocalExternallyUsedValues, &TrackedVals](Value *V) {
                     if (isa<Instruction>(V))
                       LocalExternallyUsedValues[TrackedVals[V]];
                   });
        }
        if (!IsSupportedReusedRdxOp && HasReusedScalars) {
          // Number of uses of the candidates in the vector of values.

vdmitrie: Are you trying to repurpose the data? To be honest, I'd refrain from doing that.

          SameValuesCounter.clear();
          for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
            Value *V = Candidates[Cnt];
            if (SameValuesCounter.count(V) > 0)
              continue;
            Value *OrigV = TrackedToOrig.find(V)->second;
            SameValuesCounter[OrigV] = std::count(VL.begin(), VL.end(), V);
          }
          for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {

vdmitrie: Fuse these loops? `for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { if (Cnt >= Pos && Cnt < Pos + ReduxWidth) continue; ... }`

            Value *V = Candidates[Cnt];
            if (SameValuesCounter.count(V) > 0)
              continue;
            Value *OrigV = TrackedToOrig.find(V)->second;
            SameValuesCounter[OrigV] = std::count(VL.begin(), VL.end(), V);
          }
        }
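The fused loop from vdmitrie's inline snippet, spelled out (editor's sketch):

          for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
            // Skip the window currently being vectorized.
            if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
              continue;
            Value *V = Candidates[Cnt];
            if (SameValuesCounter.count(V) > 0)
              continue;
            Value *OrigV = TrackedToOrig.find(V)->second;
            SameValuesCounter[OrigV] = std::count(VL.begin(), VL.end(), V);
          }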
        // Gather externally used values.
        SmallPtrSet<Value *, 4> Visited;
        for (unsigned Cnt = 0; Cnt < Pos; ++Cnt) {
          Value *V = Candidates[Cnt];
          if (!Visited.insert(V).second)
            continue;
          Value *OrigV = TrackedToOrig.find(V)->second;
          unsigned NumOps = VectorizedVals.lookup(V) + SameValuesCounter[OrigV];
          if (NumOps != ReducedValsToOps.find(V)->second.size())
            LocalExternallyUsedValues[V];
        }
        for (unsigned Cnt = Pos + ReduxWidth; Cnt < NumReducedVals; ++Cnt) {
          Value *V = Candidates[Cnt];
          if (!Visited.insert(V).second)
            continue;
          Value *OrigV = TrackedToOrig.find(V)->second;
          unsigned NumOps = VectorizedVals.lookup(V) + SameValuesCounter[OrigV];

vdmitrie: Fuse these loops?

          if (NumOps != ReducedValsToOps.find(V)->second.size())
            LocalExternallyUsedValues[V];
        }
        V.buildExternalUses(LocalExternallyUsedValues);
        V.computeMinimumValueSizes();
        // Intersect the fast-math-flags from all reduction operations.
[... 51 lines not shown; enclosing scope: "for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {" ...]
        Builder.SetInsertPoint(RdxRootInst);
        // To prevent poison from leaking across what used to be sequential,
        // safe, scalar boolean logic operations, the reduction operand must be
        // frozen.
        if (isBoolLogicOp(RdxRootInst))
          VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
        // Emit code to correctly handle reused reduced values, if required.
        if (IsSupportedReusedRdxOp && HasReusedScalars && !SameScaleFactor) {
          VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, VL,
                                         SameValuesCounter, TrackedToOrig);
        }
        Value *ReducedSubTree =
            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
        // Improved analysis for add/fadd/xor reductions with same scale factor
        // for all operands of reductions. We can emit scalar ops for them
        // instead.
        if (IsSupportedReusedRdxOp && HasReusedScalars && SameScaleFactor) {
          MapVector<Value *, unsigned> RedCounter;
          RedCounter.insert(
              std::make_pair(ReducedSubTree, SameValuesCounter.front().second));
          DenseMap<Value *, Value *> RedToOrig(
              {{ReducedSubTree, ReducedSubTree}});
          ReducedSubTree = emitReusedOps(ReducedSubTree, Builder, None,
                                         RedCounter, RedToOrig);
        }
        if (!VectorizedTree) {
          // Initialize the final value in the reduction.
          VectorizedTree = ReducedSubTree;
        } else {
          // Update the final value in the reduction.
          Builder.SetCurrentDebugLocation(
              cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
          VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
                                    ReducedSubTree, "op.rdx", ReductionOps);
        }
        // Count vectorized reduced values to exclude them from final reduction.
        for (Value *V : VL) {
          Value *OrigV = TrackedToOrig.find(V)->second;
          if (IsSupportedReusedRdxOp) {
            VectorizedVals.try_emplace(OrigV, SameValuesCounter[OrigV]);
            continue;
          }
          ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
        }
        Pos += ReduxWidth;
        Start = Pos;
        ReduxWidth = PowerOf2Floor(NumReducedVals - Pos);
        AnyVectorized = true;
      }
      if (IsSupportedReusedRdxOp && HasReusedScalars && !AnyVectorized) {
        for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
          Value *RedVal = emitReusedOps(P.first, Builder, None,
                                        SameValuesCounter, TrackedToOrig);
          if (!VectorizedTree) {
            // Initialize the final value in the reduction.
            VectorizedTree = RedVal;
          } else {
            // Update the final value in the reduction.
            Builder.SetCurrentDebugLocation(
                cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
            VectorizedTree = createOp(Builder, RdxKind, VectorizedTree, RedVal,
                                      "op.rdx", ReductionOps);
          }
          Value *OrigV = TrackedToOrig.find(P.first)->second;
          VectorizedVals.try_emplace(OrigV, P.second);
        }
        continue;
      }
    }
    if (VectorizedTree) {
      // Reorder operands of bool logical op in the natural order to avoid
      // possible problem with poison propagation. If not possible to reorder
      // (both operands are originally RHS), emit an extra freeze instruction
      // for the LHS operand.
      // I.e., if we have original code like this:
[... 211 lines not shown; enclosing scope: "Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder," ...]
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");
    ++NumVectorInstructions;
    return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind);
  }
vdmitrie: Please add a description.

  Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                       ArrayRef<Value *> VL,
                       const MapVector<Value *, unsigned> &SameValuesCounter,
                       const DenseMap<Value *, Value *> &TrackedToOrig) {
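A description of the kind vdmitrie requests might read as follows (editor's sketch summarizing the switch below; it would sit directly above the signature):

  /// Emit code that compensates for reused (repeated) scalar values in the
  /// reduction: scale add/fadd results by the repeat count via mul/fmul with
  /// a constant, zero out xor lanes whose value repeats an even number of
  /// times (x xor x = 0), and pass and/or/min/max through unchanged. With an
  /// empty \p VL the single vectorized value is adjusted as a whole;
  /// otherwise a per-lane constant is built from \p SameValuesCounter and
  /// \p TrackedToOrig.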
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      if (VL.empty()) {
        unsigned Cnt = SameValuesCounter.lookup(
            TrackedToOrig.find(VectorizedValue)->second);
        Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
        return Builder.CreateMul(VectorizedValue, Scale);
      }

vdmitrie: okay :-) that works too. Just a n.i.t. remark: you could reduce the arguments to just these if the code was a dedicated method: ...
ABataev: It is not possible; both SameValuesCounter and TrackedToOrig are used for each element of VL in the second switch to build the correct constant vector.
vdmitrie: I don't see that. Isn't `unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(VectorizedValue)->second);` the only use of them in this part (under the condition VL.empty())?
vdmitrie: Ah, I missed that you said about the second switch. I was thinking about the first part of the method. ... here goes the code which is now under VL.empty() ... }

      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateMul(VectorizedValue, Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
      // No need for multiple min/max(s) of the same value.
      return VectorizedValue;

vdmitrie: What I basically see here is that you packed two methods into one and differentiate them by the VL argument.
ABataev: I thought about it; just in many cases we reuse the same code, so I decided to put it into a single switch (just (f)add and xor have special processing).
vdmitrie: I don't buy this kind of savings. Ideally we want to assert that we have one of the add/fadd/xor reduction kinds when we fall into the same-scale-factor optimization. This is why I advocate for splitting it in two. We do not save a lot on a switch which will be handling just 3 cases and a default one for the second method. The code inside the switch cases is already separate. So it is easy to do from the very beginning.
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
      if (VL.empty()) {
        unsigned Cnt = SameValuesCounter.lookup(
            TrackedToOrig.find(VectorizedValue)->second);
        if (Cnt % 2 == 0)
          return Constant::getNullValue(VectorizedValue->getType());
        return VectorizedValue;
      }
      SmallVector<int> Mask(
          cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
          UndefMaskElem);
      std::iota(Mask.begin(), Mask.end(), 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        if (Cnt % 2 == 0) {
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            VectorizedValue,
            ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      if (VL.empty()) {
        unsigned Cnt = SameValuesCounter.lookup(
            TrackedToOrig.find(VectorizedValue)->second);
        Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
        return Builder.CreateFMul(VectorizedValue, Scale);
      }
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
        Vals.push_back(ConstantFP::get(V->getType(), Cnt));
      }
      auto *Scale = ConstantVector::get(Vals);
      return Builder.CreateFMul(VectorizedValue, Scale);
    }
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::SelectICmp:
    case RecurKind::SelectFCmp:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
  }
};

} // end anonymous namespace

vdmitrie: Are you relying on instcombiner to optimize this further? I mean, for example, mul X, 2 -> add X, X.
ABataev: Yes, because there might be some other numbers, say 3, 4, etc.
static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
    return cast<FixedVectorType>(IE->getType())->getNumElements();
  unsigned AggregateSize = 1;
  auto *IV = cast<InsertValueInst>(InsertInst);
  Type *CurrentType = IV->getType();
  do {
    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
      for (auto *Elt : ST->elements())
        if (Elt != ST->getElementType(0)) // check homogeneity
          return None;

vdmitrie: Maybe reorganize it a bit to add a return statement? It can produce the warning "control reaches end of non-void function [-Wreturn-type]".

      AggregateSize *= ST->getNumElements();
      CurrentType = ST->getElementType(0);
    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      AggregateSize *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
      AggregateSize *= VT->getNumElements();
      return AggregateSize;
[... 968 lines not shown ...]