/// Returns true iff at least one byte of \p x, read as an unsigned 8-bit
/// value, is strictly less than \p n.
///
/// All sizeof(unsigned long) bytes of \p x are tested at once, so a caller
/// can reject a whole word with a single branch instead of one per byte.
/// The n <= 128 case is the classic "has less in word" bit trick; see
/// http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
///
/// That trick on its own is only valid for n <= 128: with a larger bound the
/// per-byte subtraction wraps and bytes in [128, n) are missed. Larger bounds
/// are therefore handled by a separate branch so the result is exact for
/// every \p n.
///
/// A true result only says that *some* byte is below the bound. It does not
/// say which byte, nor that the byte is any particular character: with
/// n == '\r' + 1, a '\t' or '\0' byte matches just like '\n' and '\r' do.
/// Code that scans for the matching byte afterwards must still check its own
/// bounds.
///
/// \param x the word whose bytes are inspected.
/// \param n the exclusive upper bound each byte is compared against.
/// \returns true if any byte of \p x is < \p n, false otherwise.
static constexpr bool hasless(unsigned long x, unsigned long n) {
  // 0x0101...01 and 0x8080...80, for whatever width unsigned long has.
  constexpr unsigned long Ones = ~0UL / 255;
  constexpr unsigned long HighBits = Ones * 128;

  if (n <= 128) {
    // Subtracting n from every byte sets a byte's top bit when that byte
    // wrapped below zero; `& ~x` discards bytes that already had the top bit
    // set (values >= 128, which can never be < n here). A borrow can only
    // leave a byte that is itself < n, so the any-byte answer stays exact.
    return ((x - Ones * n) & ~x & HighBits) != 0;
  }

  // Every byte value is below a bound of 256 or more.
  if (n >= 256)
    return true;

  // 128 < n < 256. A byte with its top bit clear is < 128 and thus < n.
  const unsigned long BelowHalf = ~x & HighBits;

  // A byte with its top bit set is < n iff its low seven bits are < n - 128.
  // Forcing the top bit on before subtracting guarantees no byte borrows from
  // its neighbour; the top bit then survives iff low7 >= n - 128.
  const unsigned long Low7 = x & (Ones * 127);
  const unsigned long Diff = (Low7 | HighBits) - Ones * (n - 128);
  const unsigned long UpperHalfBelow = ~Diff & x & HighBits;

  return (BelowHalf | UpperHalfBelow) != 0;
}