This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contentst

-
llvm/include/llvm/Support/
-
include/
-
llvm/
-
Support/
28/28
LEB128.h

Differential D78796

[Support] Refactor LEB128 encoding into an input iterator
AcceptedPublic

Authored by nlguillemot on Apr 23 2020, 11:26 PM.

Download Raw Diff

Details

Reviewers

bogner
qcolombet
fhahn
rtereshin
dsanders

Summary

Refactors the logic for LEB128 encoding into a std iterator style input
iterator. This allows the output of LEB128 encoding to be passed into std
algorithms. This allowed refactoring the existing LEB128 encoding
functions to become syntactical sugar for calls to std::copy, and
this patch updates some existing uses of LEB128 encoding to take
advantage of the iterator APIs to write cleaner code. Also, To help separate
the iterator syntactical sugar from the LEB128 logic, the contents of the
inner loop of LEB128 encoding was factored out into a separate function.

Diff Detail

Event Timeline

nlguillemot created this revision.Apr 23 2020, 11:26 PM

nlguillemot added a parent revision: D78795: [Support] Add raw_ostream_iterator: ostream_iterator for raw_ostream..

nlguillemot added a child revision: D78797: [Support] Refactor LEB128 decoding into an output iterator.Apr 23 2020, 11:30 PM

The end goal of this series of patches is to support encoding/decoding APInt to/from [U|S]LEB128. This patch is an initial step in that direction. It refactors the logic to avoid code duplication and makes the interface to LEB128 encoding much more generic, as shown by the existing cases becoming very short.

In a later patch, I'd like to do some template magic to support LEB128InputIterator<APInt> in a way that tries to share code with the uint64_t/int64_t implementations.

Refactored the encode functions more aggressively by factoring out the common logic for writing to an array or to a raw_ostream:

diff
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 729ee5ca745..f65dc919e39 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -168,43 +168,52 @@ public:
   ///@}
 };
 
+/// Utility function to encode a SLEB128 or ULEB128 value to a buffer. Returns
+/// the length in bytes of the encoded value.
+template <class ValueT>
+unsigned encodeLEB128(const ValueT &Value, bool IsSigned, uint8_t *p,
+                      unsigned PadTo = 0) {
+  uint8_t *orig_p = p;
+  p = std::copy(LEB128InputIterator<ValueT>(Value, IsSigned, PadTo),
+                LEB128InputIterator<ValueT>(), p);
+  return (unsigned)(p - orig_p);
+}
+
+/// Utility function to encode a SLEB128 or ULEB128 value to an output stream.
+/// Returns the length in bytes of the encoded value.
+template <class ValueT>
+inline unsigned encodeLEB128(const ValueT &Value, bool IsSigned,
+                             raw_ostream &OS, unsigned PadTo = 0) {
+  uint64_t TellBefore = OS.tell();
+  std::copy(LEB128InputIterator<ValueT>(Value, IsSigned, PadTo),
+            LEB128InputIterator<ValueT>(), raw_ostream_iterator<uint8_t>(OS));
+  return (unsigned)(OS.tell() - TellBefore);
+}
+
 /// Utility function to encode a SLEB128 value to an output stream. Returns
 /// the length in bytes of the encoded value.
 inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS,
                               unsigned PadTo = 0) {
-  uint64_t TellBefore = OS.tell();
-  std::copy(LEB128InputIterator<int64_t>(Value, /* IsSigned */ true, PadTo),
-            LEB128InputIterator<int64_t>(), raw_ostream_iterator<uint8_t>(OS));
-  return (unsigned)(OS.tell() - TellBefore);
+  return encodeLEB128(Value, /* IsSigned */ true, OS, PadTo);
 }

 /// Utility function to encode a SLEB128 value to a buffer. Returns
 /// the length in bytes of the encoded value.
 inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) {
-  uint8_t *orig_p = p;
-  p = std::copy(LEB128InputIterator<int64_t>(Value, /* IsSigned */ true, PadTo),
-                LEB128InputIterator<int64_t>(), p);
-  return (unsigned)(p - orig_p);
+  return encodeLEB128(Value, /* IsSigned */ true, p, PadTo);
 }

 /// Utility function to encode a ULEB128 value to an output stream. Returns
 /// the length in bytes of the encoded value.
 inline unsigned encodeULEB128(uint64_t Value, raw_ostream &OS,
                               unsigned PadTo = 0) {
-  uint64_t TellBefore = OS.tell();
-  std::copy(LEB128InputIterator<uint64_t>(Value, /* IsSigned */ false, PadTo),
-            LEB128InputIterator<uint64_t>(), raw_ostream_iterator<uint8_t>(OS));
-  return (unsigned)(OS.tell() - TellBefore);
+  return encodeLEB128(Value, /* IsSigned */ false, OS, PadTo);
 }

 /// Utility function to encode a ULEB128 value to a buffer. Returns
 /// the length in bytes of the encoded value.
 inline unsigned encodeULEB128(uint64_t Value, uint8_t *p, unsigned PadTo = 0) {
-  uint8_t *orig_p = p;
-  p = std::copy(
-      LEB128InputIterator<uint64_t>(Value, /* IsSigned */ false, PadTo),
-      LEB128InputIterator<uint64_t>(), p);
-  return (unsigned)(p - orig_p);
+  return encodeLEB128(Value, /* IsSigned */ false, p, PadTo);
 }

 /// Utility function to decode a ULEB128 value.

Removed unnecessary inline

diff
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
index 2549ff86ccd..8c70b41d27c 100644
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -182,8 +182,8 @@ unsigned encodeLEB128(const ValueT &Value, bool IsSigned, uint8_t *p,
 /// Utility function to encode a SLEB128 or ULEB128 value to an output stream.
 /// Returns the length in bytes of the encoded value.
 template <class ValueT>
-inline unsigned encodeLEB128(const ValueT &Value, bool IsSigned,
-                             raw_ostream &OS, unsigned PadTo = 0) {
+unsigned encodeLEB128(const ValueT &Value, bool IsSigned, raw_ostream &OS,
+                      unsigned PadTo = 0) {
   uint64_t TellBefore = OS.tell();
   std::copy(LEB128InputIterator<ValueT>(Value, IsSigned, PadTo),
             LEB128InputIterator<ValueT>(), raw_ostream_iterator<uint8_t>(OS));

nlguillemot marked an inline comment as done.Apr 24 2020, 8:59 PM

nlguillemot added inline comments.

llvm/include/llvm/Support/LEB128.h
90	Self-review: This comment is wrong, it's not always ZExt. Whether the padding is ZExt or SExt depends on IsSigned.

Updated comments about zext to include a mention of sext.

diff
diff --git a/llvm/include/llvm/Support/LEB128.h b/llvm/include/llvm/Support/LEB128.h
--- a/llvm/include/llvm/Support/LEB128.h
+++ b/llvm/include/llvm/Support/LEB128.h
@@ -35,7 +35,7 @@ template <class ValueT> class LEB128InputIterator {
   /// Whether there will be more output after the previously outputted byte.
   bool More;

-  /// The output will be zext-ed to this number of bytes if necessary.
+  /// The output will be sext-ed/zext-ed to this number of bytes if necessary.
   unsigned PadTo;

   /// The current number of outputted bytes.
@@ -87,8 +87,9 @@ public:
   /// If IsSigned is true, then it encodes as SLEB128. If it's false, it encodes
   /// as ULEB128.
   ///
-  /// \param PadTo ZExt the output to this number of bytes if fewer than this
-  /// number of bytes have been outputted.
+  /// \param PadTo Pads the output to this number of bytes if fewer than this
+  /// number of bytes have been outputted. If IsSigned is true, then the padding
+  /// is sign-extended. If IsSigned is false, then it's zero-extended.
   explicit LEB128InputIterator(ValueT Value, bool IsSigned, unsigned PadTo)
       : IsEnd(false), Value(std::move(Value)), IsSigned(IsSigned), PadTo(PadTo),
         Count(0) {

nlguillemot added a reviewer: rtereshin.Apr 26 2020, 11:35 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 26 2020, 11:35 AM

dblaikie added a subscriber: dblaikie.Apr 27 2020, 10:42 PM

I'm not sure this is worth a full iterator abstraction - would the uses be that much the worse for it if they had a non-iterator type they had to iterate manually & pull values from? A more simplified iterator abstraction, essentially:

ULEBifier U(Value, IsSigned, PadTo);
while (Optional<char> C = U.next())
  OS.write(*C);

Or something like that.

In D78796#2007033, @dblaikie wrote:
I'm not sure this is worth a full iterator abstraction - would the uses be that much the worse for it if they had a non-iterator type they had to iterate manually & pull values from? A more simplified iterator abstraction, essentially:
ULEBifier U(Value, IsSigned, PadTo);
while (Optional<char> C = U.next())
  OS.write(*C);
Or something like that.

I understand that the std iterator design is not the simplest. It needs a bunch of boilerplate, and the STL iterator APIs have their own quirks that might increase the mental burden for users. With these cons in mind, let me try to justify why I think this design is the right direction.

The main advantage of using the iterator interface is to reuse code in std::/llvm::. For example, in FixedLenDecoderEmitter.cpp, there's the following code:

// Encode and emit the value to filter against.
uint8_t Buffer[16];
unsigned Len = encodeULEB128(Filter.first, Buffer);
Table.insert(Table.end(), Buffer, Buffer + Len);

With the iterator style interface, this turns into a "one-liner" (ok it's really three lines with formatting but hey...) :

// Encode and emit the value to filter against.
std::copy(
  LEB128InputIterator<unsigned>(Filter.first, /* IsSigned */ false, /* PadTo */ 0),
  LEBInputIterator<unsigned>(), std::back_inserter(Table));

At this point, we could simplify it further by make further syntactical improvements:

Make a utility function like makeULEB128InputIterator() to deduce the ValueT template argument, to avoid explicitly passing IsSigned, and to have PadTo = 0 as a default value.
Make a utility function like makeULEB128InputRange() that uses llvm::make_range() to automatically package the end iterator with the begin iterator.

If we had these further improvements, we could then use llvm::copy from STLExtras to get the following code, which is finally truly a "one-liner":

// Encode and emit the value to filter against.
copy(makeULEB128InputRange(Filter.first), std::back_inserter(Table));

This shows that we can use this API to simplify existing code, and we can also reuse existing code in std:: and llvm:: to simplify it further. The standard iterator interface is what enables this.

The advantage of being able to reuse code in std::/llvm:: is also demonstrated in this patch itself: The existing implementations were reduced to "one-liners" of std::copy that only differ by the output iterator.

Another point: If somebody wants to write a loop in the style that you showed with ULEBifier, that can also be done with this interface (though the example below could be improved by using prefix increment):

LEB128InputIterator<unsigned> U(Value, IsSigned, PadTo);
while (U != LEBInputIterator<unsigned>())
  OS.write(*U++);

If we can express the same code with both interface designs, then I think we might as well use the more general design that allows us to build on top of the existing std::/llvm:: algorithms. I think it would be particularly unfortunate if we started with something like ULEBifier, then later we added an iterator style wrapper for it anyways, since that would create code duplication and redundant APIs.

I'm inclined to agree that the patch series as-is doesn't really warrant the iterators as the interface as no callers have been updated. However, I also don't see much that's iterator specific (ULEBifier would be roughly similar code leaving the iterator portion as trivial wrappers on the ULEBifier) and there are a few places (particularly in tablegen) that are emitting LEB's into containers where the loop to add bytes one by one is just noise and something like std::copy(to_uleb(...), std::back_inserter(Table)); would be somewhat nice. The loop could easily be hidden in something like append_uleb(Table, ...) though I don't think there's a strong argument for (or against) iterators.

What I would suggest is separating out the byte-sequence generation into a ULEBifier as David suggested but still keeping the iterator object as a thin adapter (effectively implementing that while loop) to support inter-operation with STL functions like std::copy.

llvm/include/llvm/Support/LEB128.h
26–27	We could fold this into Optional<uint8_t> CurrByte
32–33	It may be worth mentioning that this is needed for types that don't carry signedness like APInt. unsigned/int/etc. wouldn't need it
35–36	This is misleading as the iterator can produce more bytes after this becomes false. I think it's also unnecessary as each time encodeNextByte() shifts Value right by 7 it's bringing in the correct padding bits at the top. We could just keep reading from Value for the padding bytes.
38–42	Would it make sense to have a RemainingBytes that counts down to zero? Is there a reason to keep PadTo and Count separate?
50	Is testing this on each byte measurably slower? It seems unlikely but this is called a lot and I notice that the previous code didn't do it. If it does, it would be good if we can have the template instantiation pick one side or the other
121–136	I think this belongs inside encodeNextByte(). This operator should essentially be something like: assert(CurrByte.hasValue() && "operator++() called on past-the-end LEB128InputIterator"); CurrByte = encodeNextByte(); return this; where CurrByte is Optional<uint8_t>. As noted in another comment, I think special-casing the padding bytes isn't really needed.
155–157	I see no harm in allowing two past-the-end iterators from different sequences to be equal but I wonder if it's necessary/useful. With the ULEBifier object it could give you an end iterator and then this operator would be a plain equality comparison.

In D78796#2025650, @dsanders wrote:

I'm inclined to agree that the patch series as-is doesn't really warrant the iterators as the interface as no callers have been updated. However, I also don't see much that's iterator specific (ULEBifier would be roughly similar code leaving the iterator portion as trivial wrappers on the ULEBifier) and there are a few places (particularly in tablegen) that are emitting LEB's into containers where the loop to add bytes one by one is just noise and something like std::copy(to_uleb(...), std::back_inserter(Table)); would be somewhat nice. The loop could easily be hidden in something like append_uleb(Table, ...) though I don't think there's a strong argument for (or against) iterators.

What I would suggest is separating out the byte-sequence generation into a ULEBifier as David suggested but still keeping the iterator object as a thin adapter (effectively implementing that while loop) to support inter-operation with STL functions like std::copy.

What is the core issue with the iterator interface that makes it desirable to have something like ULEBifier<T> instead?

If it's a matter of user interface design, then I would rather implement ULEBifier<T> in terms of the iterator, since the iterator is more generic.

If it's a matter of separating the iterator-specific boilerplate from the logic, then I would rather do that by refactoring the implementation to put the core logic in private member functions. That would make it more clear to see what is iterator boilerplate and what is not.

In either case, I would rather not have 2 APIs that do the same thing, for the sake of consistency in the codebase and to avoid code duplication.

In D78796#2025835, @nlguillemot wrote:

In D78796#2025650, @dsanders wrote:

I'm inclined to agree that the patch series as-is doesn't really warrant the iterators as the interface as no callers have been updated. However, I also don't see much that's iterator specific (ULEBifier would be roughly similar code leaving the iterator portion as trivial wrappers on the ULEBifier) and there are a few places (particularly in tablegen) that are emitting LEB's into containers where the loop to add bytes one by one is just noise and something like std::copy(to_uleb(...), std::back_inserter(Table)); would be somewhat nice. The loop could easily be hidden in something like append_uleb(Table, ...) though I don't think there's a strong argument for (or against) iterators.

What I would suggest is separating out the byte-sequence generation into a ULEBifier as David suggested but still keeping the iterator object as a thin adapter (effectively implementing that while loop) to support inter-operation with STL functions like std::copy.

What is the core issue with the iterator interface that makes it desirable to have something like ULEBifier<T> instead?

For me it's that iterators reference and indirect into the elements of a container. They shouldn't be the container themselves

In D78796#2026009, @dsanders wrote:

In D78796#2025835, @nlguillemot wrote:

In D78796#2025650, @dsanders wrote:

I'm inclined to agree that the patch series as-is doesn't really warrant the iterators as the interface as no callers have been updated. However, I also don't see much that's iterator specific (ULEBifier would be roughly similar code leaving the iterator portion as trivial wrappers on the ULEBifier) and there are a few places (particularly in tablegen) that are emitting LEB's into containers where the loop to add bytes one by one is just noise and something like std::copy(to_uleb(...), std::back_inserter(Table)); would be somewhat nice. The loop could easily be hidden in something like append_uleb(Table, ...) though I don't think there's a strong argument for (or against) iterators.

What I would suggest is separating out the byte-sequence generation into a ULEBifier as David suggested but still keeping the iterator object as a thin adapter (effectively implementing that while loop) to support inter-operation with STL functions like std::copy.

What is the core issue with the iterator interface that makes it desirable to have something like ULEBifier<T> instead?

For me it's that iterators reference and indirect into the elements of a container. They shouldn't be the container themselves

If we think of an integer as a container of bits then it's not that different from a normal iterator. In this case we make a copy of the input because it's convenient and makes the interface simpler to use, but we could refactor the code to avoid mutating a copy of the input if that's important.

Hey Nicolas, just add some thoughts on this patch.

llvm/include/llvm/Support/LEB128.h
50	`IsSigned` could easily be changed to a template parameter. Could then turn this is into `if constexpr (IsSigned)` (if you can use C++17), or just SFINAE it. Better yet, use [[ https://en.cppreference.com/w/cpp/types/is_signed \| `is_signed` type trait ]] on `ValueT`.
80	This constructor leaves other data members uninitialized which may be fine for an end iterator, but is a code smell C.41.
93–95	Would be a good idea to initialize `More` in the initializer list just in case something changes in the future and `More` is read before being initially assigned to in `encodeNextByte`. Could also apply to `CurrByte`...
132	If `PadTo` is 0, may have unintentional underflow since it is `unsigned` (this may be you intention though, and if so it's worth a comment). Seeing it a lot in the other code. Maybe underflow is part of the plan here?

Thanks for the comments. Added some replies.

llvm/include/llvm/Support/LEB128.h
35–36	his is misleading as the iterator can produce more bytes after this becomes false. For context, I named it that way because that's how it was named in the original implementation of the code. I agree it could be better.
38–42	For context, the `Count` and `PadTo` variables are there to match the original implementation of the code. Maybe could use a refactoring.
50	Last I heard, we have to support C++14, so no `if constexpr` fanciness allowed. :( Using `is_signed` would work for types like `uint64_t` and `int64_t`, but not `APInt`, so we somehow need to figure out how to handle that case. Seems like there's general agreement that it might as well be a template parameter though.
80	I don't think this breaks that core guideline, since the invariants are set and the object is totally usable, though there should probably be an assert for `!IsEnd` at the start of `operator++` to enforce the API requirements. On the other hand I agree it might be good to initialize the other members anyways, just to be on the safe side, and to avoid having a copy constructor that reads uninitialized memory. The alternative is to hand-write the copy constructor to avoid copying the other fields if `IsEnd` is set, which sounds ugly and less maintainable.
132	If `PadTo` is 0 then it's impossible for the check above of `if (Count < PadTo)` to pass, so we would never hit this line of code. This code should be safe at least. If there are other cases we should double-check them as well.
155–157	The same design issue also happens with existing std input iterators like `std::istream_iterator`, so I think it's acceptable even if slightly odd.

Sorry for the delay coming back to this.

If there's sufficient agreement/justification/push to have an iterator interface, I think it'd be OK/maybe better to just have that, rather than the two - in either wrapping order. I do appreciate the "range-like" object (ULEBifier or whatever it's called) to make it easier to use with range-based algorithms, etc.

Address various review feedback (see relevant comment threads).
Made IsSigned a template parameter.
Moved the core logic of LEB128 encoding into its own function to better separate it from the syntactical sugar.
Added some convenience functions for creating LEB128 input iterator ranges.
Used these new input iterator ranges to simplify some code in FixedLenDecoderEmitter.cpp.

Herald added a subscriber: dexonsmith. · View Herald TranscriptFeb 3 2021, 5:19 PM

Some tweaks and addressing some more review comments.

nlguillemot marked an inline comment as done.Feb 3 2021, 5:34 PM

nlguillemot added inline comments.

llvm/include/llvm/Support/LEB128.h
26–27	Implement in the latest update: Removed the `IsEnd` member and turned `CurrByte` into an `Optional` instead, where `CurrByte == None` means the same thing as `IsEnd == true` did.
32–33	Added a mention of this in the comments.
35–36	Updated the name of `More` to make more sense and slightly simplified the logic surrounding it. Also the padding bits now come from the 7 least significant bits like you suggested.
38–42	I tried implementing `RemainingBytes` instead of `PadTo`/`Count`, but the problem is that we don't actually know how many bytes are remaining because the algorithm encodes LEB128 bytes one-by-one. `PadTo` is usually equal to 0 which results in having no padding.
50	`IsSigned` is now a template parameter.
80	All members are now initialized to sensible values to avoid potential undefined behavior.
93–95	All members are now initialized in all cases.
121–136	Moved the logic inside the core function and now the operator looks like what you described.
132	Avoided any potential underflow by switching the logic to `if (Count + 1 < PadTo)`
155–157	After messing with it for a while I think it's simplest to keep it this way. Mainly because it simplifies the logic under it, since the member-by-member comparison doesn't have to worry about comparing the members of end iterators, or comparing the members of an end iterator with the members of a non-end iterator.

nlguillemot marked an inline comment as done.Feb 3 2021, 5:52 PM

(leaving this for @dsanders to close on whether their feedback has been suitably addressed - feel free to loop me back in if tie breaking or re-approval is required, etc)

LGTM

FWIW, I still feel the encoder state shouldn't live inside the iterator but I am a bit happier that the iterator doesn't own the non-trivial encoder logic anymore. It's more consistent with pointing into a container rather than being the container, even if the pointer is unusually descriptive and the container is really more of a description of all the possible containers disambiguated by references to a specific element. I do wonder why do the logic and not the storage though. All that said, pragmatically I don't really want to insist on that as it's been in review for a very long time now so I think the middle ground is, LGTM as it currently is and if someone needs to use it in a generator-like style at some point they'll be the ones to put detail::encodeLEB128Byte() and the state in an object and have the iterators use that. Similarly, if we grow more places where iterators don't make sense they can either use the existing functions or do the same transformation just described.

This revision is now accepted and ready to land.Feb 8 2021, 3:47 PM

In D78796#2549891, @dsanders wrote:

I do wonder why do the logic and not the storage though.

At some point while working on this I tried putting all the members in a separate struct, but it kinda felt like duplicating code so I decided to leave it as is. Like you say, it might be an interesting refactor.

Revision Contents

Path

Size

llvm/

include/

llvm/

Support/

LEB128.h

262 lines

Diff 259982

llvm/include/llvm/Support/LEB128.h

	Show All 9 Lines
	// ULEB128 values.			// ULEB128 values.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef LLVM_SUPPORT_LEB128_H			#ifndef LLVM_SUPPORT_LEB128_H
	#define LLVM_SUPPORT_LEB128_H			#define LLVM_SUPPORT_LEB128_H

	#include "llvm/Support/raw_ostream.h"			#include "llvm/Support/raw_ostream.h"
				#include <algorithm>
				#include <iterator>

	namespace llvm {			namespace llvm {

	/// Utility function to encode a SLEB128 value to an output stream. Returns			/// Converts a value into bytes encoded in LEB128 style.
	/// the length in bytes of the encoded value.			/// Matches the interface of std input iterators.
	inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS,			template <class ValueT> class LEB128InputIterator {
	unsigned PadTo = 0) {			/// Denotes whether this object represents an end() iterator.
				bool IsEnd;
				dsandersUnsubmitted Done Reply Inline Actions We could fold this into Optional<uint8_t> CurrByte dsanders: We could fold this into Optional<uint8_t> CurrByte
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Implement in the latest update: Removed the `IsEnd` member and turned `CurrByte` into an `Optional` instead, where `CurrByte == None` means the same thing as `IsEnd == true` did. nlguillemot: Implement in the latest update: Removed the `IsEnd` member and turned `CurrByte` into an…

				/// The current value that the iterator is converting into LEB128.
				ValueT Value;

				/// Whether this iterator is outputting a signed or unsigned LEB128 data.
				bool IsSigned;
				dsandersUnsubmitted Done Reply Inline Actions It may be worth mentioning that this is needed for types that don't carry signedness like APInt. unsigned/int/etc. wouldn't need it dsanders: It may be worth mentioning that this is needed for types that don't carry signedness like APInt.
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Added a mention of this in the comments. nlguillemot: Added a mention of this in the comments.

				/// Whether there will be more output after the previously outputted byte.
	bool More;			bool More;
				dsandersUnsubmitted Done Reply Inline Actions This is misleading as the iterator can produce more bytes after this becomes false. I think it's also unnecessary as each time encodeNextByte() shifts Value right by 7 it's bringing in the correct padding bits at the top. We could just keep reading from Value for the padding bytes. dsanders: This is misleading as the iterator can produce more bytes after this becomes false. I think…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions his is misleading as the iterator can produce more bytes after this becomes false. For context, I named it that way because that's how it was named in the original implementation of the code. I agree it could be better. nlguillemot: > his is misleading as the iterator can produce more bytes after this becomes false. For…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Updated the name of `More` to make more sense and slightly simplified the logic surrounding it. Also the padding bits now come from the 7 least significant bits like you suggested. nlguillemot: Updated the name of `More` to make more sense and slightly simplified the logic surrounding it.
	unsigned Count = 0;
	do {			/// The output will be zext-ed to this number of bytes if necessary.
	uint8_t Byte = Value & 0x7f;			unsigned PadTo;

				/// The current number of outputted bytes.
				unsigned Count;
				dsandersUnsubmitted Done Reply Inline Actions Would it make sense to have a RemainingBytes that counts down to zero? Is there a reason to keep PadTo and Count separate? dsanders: Would it make sense to have a RemainingBytes that counts down to zero? Is there a reason to…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions For context, the `Count` and `PadTo` variables are there to match the original implementation of the code. Maybe could use a refactoring. nlguillemot: For context, the `Count` and `PadTo` variables are there to match the original implementation…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions I tried implementing `RemainingBytes` instead of `PadTo`/`Count`, but the problem is that we don't actually know how many bytes are remaining because the algorithm encodes LEB128 bytes one-by-one. `PadTo` is usually equal to 0 which results in having no padding. nlguillemot: I tried implementing `RemainingBytes` instead of `PadTo`/`Count`, but the problem is that we…

				/// The current LEB128 encoded byte that this iterator will return.
				uint8_t CurrByte;

				/// Consumes 7 bits from Value and encodes them as a LEB128 byte in CurrByte.
				void encodeNextByte() {
				CurrByte = Value & 0x7f;
				if (IsSigned) {
				dsandersUnsubmitted Done Reply Inline Actions Is testing this on each byte measurably slower? It seems unlikely but this is called a lot and I notice that the previous code didn't do it. If it does, it would be good if we can have the template instantiation pick one side or the other dsanders: Is testing this on each byte measurably slower? It seems unlikely but this is called a lot and…
				mkitzanUnsubmitted Done Reply Inline Actions `IsSigned` could easily be changed to a template parameter. Could then turn this is into `if constexpr (IsSigned)` (if you can use C++17), or just SFINAE it. Better yet, use [[ https://en.cppreference.com/w/cpp/types/is_signed \| `is_signed` type trait ]] on `ValueT`. mkitzan: `IsSigned` could easily be changed to a template parameter. Could then turn this is into `if…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Last I heard, we have to support C++14, so no `if constexpr` fanciness allowed. :( Using `is_signed` would work for types like `uint64_t` and `int64_t`, but not `APInt`, so we somehow need to figure out how to handle that case. Seems like there's general agreement that it might as well be a template parameter though. nlguillemot: Last I heard, we have to support C++14, so no `if constexpr` fanciness allowed. :( Using…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions `IsSigned` is now a template parameter. nlguillemot: `IsSigned` is now a template parameter.
				int64_t SValue = static_cast<int64_t>(Value);
	// NOTE: this assumes that this signed shift is an arithmetic right shift.			// NOTE: this assumes that this signed shift is an arithmetic right shift.
	Value >>= 7;			SValue >>= 7;
	More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) \|\|			More = !((((SValue == 0) && ((CurrByte & 0x40) == 0)) \|\|
	((Value == -1) && ((Byte & 0x40) != 0))));			((SValue == -1) && ((CurrByte & 0x40) != 0))));
				Value = static_cast<ValueT>(SValue);
				} else {
				// Logical right shift.
				uint64_t UValue = static_cast<uint64_t>(Value);
				UValue >>= 7;
				More = UValue != 0;
				Value = static_cast<ValueT>(UValue);
				}
	Count++;			Count++;
	if (More \|\| Count < PadTo)			if (More \|\| Count < PadTo)
	Byte \|= 0x80; // Mark this byte to show that more bytes will follow.			CurrByte \|= 0x80; // Mark this byte to show that more bytes will follow.
	OS << char(Byte);			}
	} while (More);

	// Pad with 0x80 and emit a terminating byte at the end.			public:
				/// Boilerplate typedefs for C++ iterators.
				///@{
				using iterator_category = std::input_iterator_tag;
				using value_type = uint8_t;
				using difference_type = std::ptrdiff_t;
				using pointer = const uint8_t *;
				using reference = const uint8_t &;
				///@}

				/// Constructs the end-of-input iterator.
				LEB128InputIterator() : IsEnd(true) {}
				mkitzanUnsubmitted Done Reply Inline Actions This constructor leaves other data members uninitialized which may be fine for an end iterator, but is a code smell C.41. mkitzan: This constructor leaves other data members uninitialized which may be fine for an end iterator…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions I don't think this breaks that core guideline, since the invariants are set and the object is totally usable, though there should probably be an assert for `!IsEnd` at the start of `operator++` to enforce the API requirements. On the other hand I agree it might be good to initialize the other members anyways, just to be on the safe side, and to avoid having a copy constructor that reads uninitialized memory. The alternative is to hand-write the copy constructor to avoid copying the other fields if `IsEnd` is set, which sounds ugly and less maintainable. nlguillemot: I don't think this breaks that core guideline, since the invariants are set and the object is…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions All members are now initialized to sensible values to avoid potential undefined behavior. nlguillemot: All members are now initialized to sensible values to avoid potential undefined behavior.

				/// Initializes the iterator that converts \p Value to LEB128.
				///
				/// \param Value The value to convert to LEB128 bytes.
				///
				/// \param IsSigned How to treat the signedness of \p Value. In other words:
				/// If IsSigned is true, then it encodes as SLEB128. If it's false, it encodes
				/// as ULEB128.
				///
				/// \param PadTo ZExt the output to this number of bytes if fewer than this
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Self-review: This comment is wrong, it's not always ZExt. Whether the padding is ZExt or SExt depends on IsSigned. nlguillemot: Self-review: This comment is wrong, it's not always ZExt. Whether the padding is ZExt or SExt…
				/// number of bytes have been outputted.
				explicit LEB128InputIterator(ValueT Value, bool IsSigned, unsigned PadTo)
				: IsEnd(false), Value(std::move(Value)), IsSigned(IsSigned), PadTo(PadTo),
				Count(0) {
				// Initialize the iterator to the first LEB128-encoded byte.
				mkitzanUnsubmitted Done Reply Inline Actions Would be a good idea to initialize `More` in the initializer list just in case something changes in the future and `More` is read before being initially assigned to in `encodeNextByte`. Could also apply to `CurrByte`... mkitzan: Would be a good idea to initialize `More` in the initializer list just in case something…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions All members are now initialized in all cases. nlguillemot: All members are now initialized in all cases.
				encodeNextByte();
				}

				/// Constructs a copy of \p Other.
				LEB128InputIterator(const LEB128InputIterator &Other) = default;

				/// Get the current LEB128-encoded byte.
				///@{
				const uint8_t &operator*() const {
				assert(!IsEnd && "operator*() called on past-the-end LEB128InputIterator");
				return CurrByte;
				}
				const uint8_t operator->() const { return &operator(); }
				///@}

				/// Increment the iterator to the next LEB128-encoded byte.
				///@{
				LEB128InputIterator &operator++() {
				// At first, keep encoding bytes from Value until there is nothing left.
				if (More) {
				encodeNextByte();
				return *this;
				}

				// Pad with 0s or 1s depending on whether we want to zext or sext.
				uint8_t PadValue;
				if (IsSigned && static_cast<int64_t>(Value) < 0)
				PadValue = 0x7f;
				else
				PadValue = 0x00;

				// Output padding as necessary.
	if (Count < PadTo) {			if (Count < PadTo) {
	uint8_t PadValue = Value < 0 ? 0x7f : 0x00;			CurrByte = PadValue;
	for (; Count < PadTo - 1; ++Count)			// Add a continuation bit to signal that there is more padding after this.
	OS << char(PadValue \| 0x80);			if (Count < PadTo - 1)
	OS << char(PadValue);			CurrByte \|= 0x80;
				mkitzanUnsubmitted Done Reply Inline Actions If `PadTo` is 0, may have unintentional underflow since it is `unsigned` (this may be you intention though, and if so it's worth a comment). Seeing it a lot in the other code. Maybe underflow is part of the plan here? mkitzan: If `PadTo` is 0, may have unintentional underflow since it is `unsigned` (this may be you…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions If `PadTo` is 0 then it's impossible for the check above of `if (Count < PadTo)` to pass, so we would never hit this line of code. This code should be safe at least. If there are other cases we should double-check them as well. nlguillemot: If `PadTo` is 0 then it's impossible for the check above of `if (Count < PadTo)` to pass, so we…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Avoided any potential underflow by switching the logic to `if (Count + 1 < PadTo)` nlguillemot: Avoided any potential underflow by switching the logic to `if (Count + 1 < PadTo)`
	Count++;			Count++;
				return *this;
				}

				dsandersUnsubmitted Done Reply Inline Actions I think this belongs inside encodeNextByte(). This operator should essentially be something like: assert(CurrByte.hasValue() && "operator++() called on past-the-end LEB128InputIterator"); CurrByte = encodeNextByte(); return this; where CurrByte is Optional<uint8_t>. As noted in another comment, I think special-casing the padding bytes isn't really needed. dsanders: I think this belongs inside encodeNextByte(). This operator should essentially be something…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions Moved the logic inside the core function and now the operator looks like what you described. nlguillemot: Moved the logic inside the core function and now the operator looks like what you described.
				// Nothing left to output, so we've reached the end.
				assert(!IsEnd && "operator++() called on past-the-end LEB128InputIterator");
				IsEnd = true;
				return *this;
				}
				LEB128InputIterator operator++(int) {
				LEB128InputIterator Prev = *this;
				operator++();
				return Prev;
				}
				///@}

				/// Checks whether both iterators are equal. Two iterators are equal if both
				/// of them are end-of-input iterators or both of them would generate the same
				/// sequence of outputs.
				///@{
				bool operator==(const LEB128InputIterator &Other) const {
				// Both are end-of-input iterators, so they compare the same.
				if (IsEnd && Other.IsEnd)
				return true;

				dsandersUnsubmitted Done Reply Inline Actions I see no harm in allowing two past-the-end iterators from different sequences to be equal but I wonder if it's necessary/useful. With the ULEBifier object it could give you an end iterator and then this operator would be a plain equality comparison. dsanders: I see no harm in allowing two past-the-end iterators from different sequences to be equal but I…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions The same design issue also happens with existing std input iterators like `std::istream_iterator`, so I think it's acceptable even if slightly odd. nlguillemot: The same design issue also happens with existing std input iterators like `std…
				nlguillemotAuthorUnsubmitted Done Reply Inline Actions After messing with it for a while I think it's simplest to keep it this way. Mainly because it simplifies the logic under it, since the member-by-member comparison doesn't have to worry about comparing the members of end iterators, or comparing the members of an end iterator with the members of a non-end iterator. nlguillemot: After messing with it for a while I think it's simplest to keep it this way. Mainly because it…
				// Otherwise, consider two iterators equal if they would generate the same
				// sequence of bytes.
				return IsEnd == Other.IsEnd && Value == Other.Value &&
				IsSigned == Other.IsSigned && More == Other.More &&
				PadTo == Other.PadTo && Count == Other.Count &&
				CurrByte == Other.CurrByte;
	}			}
	return Count;			bool operator!=(const LEB128InputIterator &Other) const {
				return !operator==(Other);
	}			}
				///@}
				};

	/// Utility function to encode a SLEB128 value to a buffer. Returns			/// Utility function to encode a SLEB128 or ULEB128 value to a buffer. Returns
	/// the length in bytes of the encoded value.			/// the length in bytes of the encoded value.
	inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) {			template <class ValueT>
				unsigned encodeLEB128(const ValueT &Value, bool IsSigned, uint8_t *p,
				unsigned PadTo = 0) {
	uint8_t *orig_p = p;			uint8_t *orig_p = p;
	unsigned Count = 0;			p = std::copy(LEB128InputIterator<ValueT>(Value, IsSigned, PadTo),
	bool More;			LEB128InputIterator<ValueT>(), p);
	do {
	uint8_t Byte = Value & 0x7f;
	// NOTE: this assumes that this signed shift is an arithmetic right shift.
	Value >>= 7;
	More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) \|\|
	((Value == -1) && ((Byte & 0x40) != 0))));
	Count++;
	if (More \|\| Count < PadTo)
	Byte \|= 0x80; // Mark this byte to show that more bytes will follow.
	*p++ = Byte;
	} while (More);

	// Pad with 0x80 and emit a terminating byte at the end.
	if (Count < PadTo) {
	uint8_t PadValue = Value < 0 ? 0x7f : 0x00;
	for (; Count < PadTo - 1; ++Count)
	*p++ = (PadValue \| 0x80);
	*p++ = PadValue;
	}
	return (unsigned)(p - orig_p);			return (unsigned)(p - orig_p);
	}			}

	/// Utility function to encode a ULEB128 value to an output stream. Returns			/// Utility function to encode a SLEB128 or ULEB128 value to an output stream.
				/// Returns the length in bytes of the encoded value.
				template <class ValueT>
				inline unsigned encodeLEB128(const ValueT &Value, bool IsSigned,
				raw_ostream &OS, unsigned PadTo = 0) {
				uint64_t TellBefore = OS.tell();
				std::copy(LEB128InputIterator<ValueT>(Value, IsSigned, PadTo),
				LEB128InputIterator<ValueT>(), raw_ostream_iterator<uint8_t>(OS));
				return (unsigned)(OS.tell() - TellBefore);
				}

				/// Utility function to encode a SLEB128 value to an output stream. Returns
	/// the length in bytes of the encoded value.			/// the length in bytes of the encoded value.
	inline unsigned encodeULEB128(uint64_t Value, raw_ostream &OS,			inline unsigned encodeSLEB128(int64_t Value, raw_ostream &OS,
	unsigned PadTo = 0) {			unsigned PadTo = 0) {
	unsigned Count = 0;			return encodeLEB128(Value, /* IsSigned */ true, OS, PadTo);
	do {
	uint8_t Byte = Value & 0x7f;
	Value >>= 7;
	Count++;
	if (Value != 0 \|\| Count < PadTo)
	Byte \|= 0x80; // Mark this byte to show that more bytes will follow.
	OS << char(Byte);
	} while (Value != 0);

	// Pad with 0x80 and emit a null byte at the end.
	if (Count < PadTo) {
	for (; Count < PadTo - 1; ++Count)
	OS << '\x80';
	OS << '\x00';
	Count++;
	}			}
	return Count;
				/// Utility function to encode a SLEB128 value to a buffer. Returns
				/// the length in bytes of the encoded value.
				inline unsigned encodeSLEB128(int64_t Value, uint8_t *p, unsigned PadTo = 0) {
				return encodeLEB128(Value, /* IsSigned */ true, p, PadTo);
	}			}

	/// Utility function to encode a ULEB128 value to a buffer. Returns			/// Utility function to encode a ULEB128 value to an output stream. Returns
	/// the length in bytes of the encoded value.			/// the length in bytes of the encoded value.
	inline unsigned encodeULEB128(uint64_t Value, uint8_t *p,			inline unsigned encodeULEB128(uint64_t Value, raw_ostream &OS,
	unsigned PadTo = 0) {			unsigned PadTo = 0) {
	uint8_t *orig_p = p;			return encodeLEB128(Value, /* IsSigned */ false, OS, PadTo);
	unsigned Count = 0;
	do {
	uint8_t Byte = Value & 0x7f;
	Value >>= 7;
	Count++;
	if (Value != 0 \|\| Count < PadTo)
	Byte \|= 0x80; // Mark this byte to show that more bytes will follow.
	*p++ = Byte;
	} while (Value != 0);

	// Pad with 0x80 and emit a null byte at the end.
	if (Count < PadTo) {
	for (; Count < PadTo - 1; ++Count)
	*p++ = '\x80';
	*p++ = '\x00';
	}			}

	return (unsigned)(p - orig_p);			/// Utility function to encode a ULEB128 value to a buffer. Returns
				/// the length in bytes of the encoded value.
				inline unsigned encodeULEB128(uint64_t Value, uint8_t *p, unsigned PadTo = 0) {
				return encodeLEB128(Value, /* IsSigned */ false, p, PadTo);
	}			}

	/// Utility function to decode a ULEB128 value.			/// Utility function to decode a ULEB128 value.
	inline uint64_t decodeULEB128(const uint8_t p, unsigned n = nullptr,			inline uint64_t decodeULEB128(const uint8_t p, unsigned n = nullptr,
	const uint8_t *end = nullptr,			const uint8_t *end = nullptr,
	const char **error = nullptr) {			const char **error = nullptr) {
	const uint8_t *orig_p = p;			const uint8_t *orig_p = p;
	uint64_t Value = 0;			uint64_t Value = 0;
	▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines