Skip to content

Commit ce0b995

Browse files
committedJun 14, 2017
[x86] replace div/rem with shift/mask for better shuffle combining perf
We know that shuffle masks are power-of-2 sizes, but there's no way (?) for LLVM to know that, so hack combineX86ShufflesRecursively() to be much faster by replacing div/rem with shift/mask. This makes the motivating compile-time bug in PR32037 ( https://bugs.llvm.org/show_bug.cgi?id=32037 ) about 9% faster overall. Differential Revision: https://reviews.llvm.org/D34174 llvm-svn: 305398
1 parent 4a911c8 commit ce0b995

File tree

1 file changed

+31
-13
lines changed

1 file changed

+31
-13
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+31-13
Original file line numberDiff line numberDiff line change
@@ -27970,28 +27970,45 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
2797027970
OpMask.size() % RootMask.size() == 0) ||
2797127971
OpMask.size() == RootMask.size()) &&
2797227972
"The smaller number of elements must divide the larger.");
27973-
int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27974-
int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27975-
int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27976-
assert(((RootRatio == 1 && OpRatio == 1) ||
27977-
(RootRatio == 1) != (OpRatio == 1)) &&
27973+
27974+
// This function can be performance-critical, so we rely on the power-of-2
27975+
// knowledge that we have about the mask sizes to replace div/rem ops with
27976+
// bit-masks and shifts.
27977+
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
27978+
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
27979+
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
27980+
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
27981+
27982+
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
27983+
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
27984+
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
27985+
assert((RootRatio == 1 || OpRatio == 1) &&
2797827986
"Must not have a ratio for both incoming and op masks!");
2797927987

27980-
SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27988+
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
27989+
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
27990+
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
27991+
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
27992+
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
27993+
27994+
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
2798127995

2798227996
// Merge this shuffle operation's mask into our accumulated mask. Note that
2798327997
// this shuffle's mask will be the first applied to the input, followed by the
2798427998
// root mask to get us all the way to the root value arrangement. The reason
2798527999
// for this order is that we are recursing up the operation chain.
27986-
for (int i = 0; i < MaskWidth; ++i) {
27987-
int RootIdx = i / RootRatio;
28000+
for (unsigned i = 0; i < MaskWidth; ++i) {
28001+
unsigned RootIdx = i >> RootRatioLog2;
2798828002
if (RootMask[RootIdx] < 0) {
2798928003
// This is a zero or undef lane, we're done.
2799028004
Mask[i] = RootMask[RootIdx];
2799128005
continue;
2799228006
}
2799328007

27994-
int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
28008+
// TODO: Here and below, we could convert multiply to shift-left for
28009+
// performance because we know that our mask sizes are power-of-2.
28010+
unsigned RootMaskedIdx =
28011+
RootMask[RootIdx] * RootRatio + (i & (RootRatio - 1));
2799528012

2799628013
// Just insert the scaled root mask value if it references an input other
2799728014
// than the SrcOp we're currently inserting.
@@ -28001,9 +28018,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
2800128018
continue;
2800228019
}
2800328020

28004-
RootMaskedIdx %= MaskWidth;
28021+
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
2800528022

28006-
int OpIdx = RootMaskedIdx / OpRatio;
28023+
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
2800728024
if (OpMask[OpIdx] < 0) {
2800828025
// The incoming lanes are zero or undef, it doesn't matter which ones we
2800928026
// are using.
@@ -28012,8 +28029,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
2801228029
}
2801328030

2801428031
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28015-
int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
28016-
OpMaskedIdx %= MaskWidth;
28032+
unsigned OpMaskedIdx =
28033+
OpMask[OpIdx] * OpRatio + (RootMaskedIdx & (OpRatio - 1));
28034+
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
2801728035

2801828036
if (OpMask[OpIdx] < (int)OpMask.size()) {
2801928037
assert(0 <= InputIdx0 && "Unknown target shuffle input");

0 commit comments

Comments
 (0)
Please sign in to comment.