Changeset View
Standalone View
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 6,832 Lines • ▼ Show 20 Lines | bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) { | ||||
// The reduction root is used as the insertion point for new instructions, | // The reduction root is used as the insertion point for new instructions, | ||||
// so set it as externally used to prevent it from being deleted. | // so set it as externally used to prevent it from being deleted. | ||||
ExternallyUsedValues[ReductionRoot]; | ExternallyUsedValues[ReductionRoot]; | ||||
SmallVector<Value *, 16> IgnoreList; | SmallVector<Value *, 16> IgnoreList; | ||||
for (ReductionOpsType &RdxOp : ReductionOps) | for (ReductionOpsType &RdxOp : ReductionOps) | ||||
IgnoreList.append(RdxOp.begin(), RdxOp.end()); | IgnoreList.append(RdxOp.begin(), RdxOp.end()); | ||||
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); | |||||
if (NumReducedVals > ReduxWidth) { | |||||
// In the loop below, we are building a tree based on a window of | |||||
// 'ReduxWidth' values. | |||||
// If the operands of those values have common traits (compare predicate, | |||||
// constant operand, etc), then we want to group those together to | |||||
// minimize the cost of the reduction. | |||||
// TODO: This should be extended to count common operands for | |||||
// compares and binops. | |||||
// Step 1: Count the number of times each compare predicate occurs. | |||||
SmallDenseMap<unsigned, unsigned> PredCountMap; | |||||
for (Value *RdxVal : ReducedVals) { | |||||
CmpInst::Predicate Pred; | |||||
if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value()))) | |||||
++PredCountMap[Pred]; | |||||
} | |||||
// Step 2: Sort the values so the most common predicates come first. | |||||
stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) { | |||||
ABataev: `stable_sort`? | |||||
CmpInst::Predicate PredA, PredB; | |||||
if (match(A, m_Cmp(PredA, m_Value(), m_Value())) && | |||||
match(B, m_Cmp(PredB, m_Value(), m_Value()))) { | |||||
return PredCountMap[PredA] > PredCountMap[PredB]; | |||||
} | |||||
return false; | |||||
}); | |||||
} | |||||
Value *VectorizedTree = nullptr; | Value *VectorizedTree = nullptr; | ||||
unsigned i = 0; | unsigned i = 0; | ||||
unsigned ReduxWidth = PowerOf2Floor(NumReducedVals); | |||||
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { | while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { | ||||
ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth); | ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth); | ||||
V.buildTree(VL, ExternallyUsedValues, IgnoreList); | V.buildTree(VL, ExternallyUsedValues, IgnoreList); | ||||
Optional<ArrayRef<unsigned>> Order = V.bestOrder(); | Optional<ArrayRef<unsigned>> Order = V.bestOrder(); | ||||
// TODO: Handle orders of size less than number of elements in the vector. | // TODO: Handle orders of size less than number of elements in the vector. | ||||
if (Order && Order->size() == VL.size()) { | if (Order && Order->size() == VL.size()) { | ||||
// TODO: reorder tree nodes without tree rebuilding. | // TODO: reorder tree nodes without tree rebuilding. | ||||
SmallVector<Value *, 4> ReorderedOps(VL.size()); | SmallVector<Value *, 4> ReorderedOps(VL.size()); | ||||
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines | while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { | ||||
// Update the final value in the reduction. | // Update the final value in the reduction. | ||||
Builder.SetCurrentDebugLocation(Loc); | Builder.SetCurrentDebugLocation(Loc); | ||||
OperationData VectReductionData(ReductionData.getOpcode(), | OperationData VectReductionData(ReductionData.getOpcode(), | ||||
VectorizedTree, ReducedSubTree, | VectorizedTree, ReducedSubTree, | ||||
ReductionData.getKind()); | ReductionData.getKind()); | ||||
VectorizedTree = | VectorizedTree = | ||||
VectReductionData.createOp(Builder, "op.rdx", ReductionOps); | VectReductionData.createOp(Builder, "op.rdx", ReductionOps); | ||||
} | } | ||||
i += ReduxWidth; | i += ReduxWidth; | ||||
Not Done ReplyInline ActionsMaybe, better to change this expression too? The main problem here is that the sliding window has step ReduxWidth. Maybe, use 1 here in common case? Also, I don't think sorting is safe here. It is better to keep the original order of the instructions, but pre-select only the matching ones. ABataev: Maybe, better to change this expression too? The main problem here is that the sliding window… | |||||
I'm not sure what the suggestion is exactly - if we allow any reduction width, will we effectively re-open D59710 and all of its problems (miscompiles, perf loss, etc)? Do you have an idea why sorting is not safe? If I'm seeing it correctly, we have guaranteed that all ops in the reduction are associative and additionally they are all in the same basic block. Therefore, they should be safe to reorder (the reduction itself requires that property?). spatel: I'm not sure what the suggestion is exactly - if we allow any reduction width, will we… | |||||
Not Done ReplyInline Actions
ABataev: 1. Ah, yes, I missed that it works only in case of the successful vectorization attempt. What I… | |||||
Sure - I think that is challenging given the current restrictions in this patch because we also have this limitation in buildTree_rec(): // Check that all of the compares have the same predicate. But there's a loophole for swapped predicate, so I added a test with that possibility. I think this also shows that it is possible to degrade the reduction if the IR is in some non-standard form, but I'm hoping that is not the common case. The better solution (assuming this part does not cause trouble) will be to also sort based on the operand values (that's the TODO comment I left in the code). spatel: Sure - I think that is challenging given the current restrictions in this patch because we also… | |||||
Not Done ReplyInline Actions
ABataev: 1. Agreed, might be a good candidate for future work.
2. I think we need to implement both… | |||||
ReduxWidth = PowerOf2Floor(NumReducedVals - i); | ReduxWidth = PowerOf2Floor(NumReducedVals - i); | ||||
} | } | ||||
if (VectorizedTree) { | if (VectorizedTree) { | ||||
// Finish the reduction. | // Finish the reduction. | ||||
for (; i < NumReducedVals; ++i) { | for (; i < NumReducedVals; ++i) { | ||||
auto *I = cast<Instruction>(ReducedVals[i]); | auto *I = cast<Instruction>(ReducedVals[i]); | ||||
Builder.SetCurrentDebugLocation(I->getDebugLoc()); | Builder.SetCurrentDebugLocation(I->getDebugLoc()); | ||||
▲ Show 20 Lines • Show All 785 Lines • Show Last 20 Lines |
stable_sort?