@@ -27970,28 +27970,45 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27970
27970
OpMask.size() % RootMask.size() == 0) ||
27971
27971
OpMask.size() == RootMask.size()) &&
27972
27972
"The smaller number of elements must divide the larger.");
27973
- int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27974
- int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27975
- int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27976
- assert(((RootRatio == 1 && OpRatio == 1) ||
27977
- (RootRatio == 1) != (OpRatio == 1)) &&
27973
+
27974
+ // This function can be performance-critical, so we rely on the power-of-2
27975
+ // knowledge that we have about the mask sizes to replace div/rem ops with
27976
+ // bit-masks and shifts.
27977
+ assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
27978
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
27979
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
27980
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
27981
+
27982
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
27983
+ unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
27984
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
27985
+ assert((RootRatio == 1 || OpRatio == 1) &&
27978
27986
"Must not have a ratio for both incoming and op masks!");
27979
27987
27980
- SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27988
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
27989
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
27990
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
27991
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
27992
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
27993
+
27994
+ SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
27981
27995
27982
27996
// Merge this shuffle operation's mask into our accumulated mask. Note that
27983
27997
// this shuffle's mask will be the first applied to the input, followed by the
27984
27998
// root mask to get us all the way to the root value arrangement. The reason
27985
27999
// for this order is that we are recursing up the operation chain.
27986
- for (int i = 0; i < MaskWidth; ++i) {
27987
- int RootIdx = i / RootRatio ;
28000
+ for (unsigned i = 0; i < MaskWidth; ++i) {
28001
+ unsigned RootIdx = i >> RootRatioLog2 ;
27988
28002
if (RootMask[RootIdx] < 0) {
27989
28003
// This is a zero or undef lane, we're done.
27990
28004
Mask[i] = RootMask[RootIdx];
27991
28005
continue;
27992
28006
}
27993
28007
27994
- int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
28008
+ // TODO: Here and below, we could convert multiply to shift-left for
28009
+ // performance because we know that our mask sizes are power-of-2.
28010
+ unsigned RootMaskedIdx =
28011
+ RootMask[RootIdx] * RootRatio + (i & (RootRatio - 1));
27995
28012
27996
28013
// Just insert the scaled root mask value if it references an input other
27997
28014
// than the SrcOp we're currently inserting.
@@ -28001,9 +28018,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28001
28018
continue;
28002
28019
}
28003
28020
28004
- RootMaskedIdx %= MaskWidth;
28021
+ RootMaskedIdx = RootMaskedIdx & ( MaskWidth - 1) ;
28005
28022
28006
- int OpIdx = RootMaskedIdx / OpRatio ;
28023
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2 ;
28007
28024
if (OpMask[OpIdx] < 0) {
28008
28025
// The incoming lanes are zero or undef, it doesn't matter which ones we
28009
28026
// are using.
@@ -28012,8 +28029,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28012
28029
}
28013
28030
28014
28031
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28015
- int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
28016
- OpMaskedIdx %= MaskWidth;
28032
+ unsigned OpMaskedIdx =
28033
+ OpMask[OpIdx] * OpRatio + (RootMaskedIdx & (OpRatio - 1));
28034
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28017
28035
28018
28036
if (OpMask[OpIdx] < (int)OpMask.size()) {
28019
28037
assert(0 <= InputIdx0 && "Unknown target shuffle input");
0 commit comments