diff --git a/clang/docs/DataFlowSanitizer.rst b/clang/docs/DataFlowSanitizer.rst --- a/clang/docs/DataFlowSanitizer.rst +++ b/clang/docs/DataFlowSanitizer.rst @@ -140,59 +140,14 @@ Example ======= +DataFlowSanitizer supports up to 8 labels, to achieve low CPU and code +size overhead. Base labels are simply 8-bit unsigned integers that are +powers of 2 (i.e. 1, 2, 4, 8, ..., 128), and union labels are created +by ORing base labels. + The following program demonstrates label propagation by checking that the correct labels are propagated. -.. code-block:: c++ - - #include <sanitizer/dfsan_interface.h> - #include <assert.h> - - int main(void) { - int i = 1; - dfsan_label i_label = dfsan_create_label("i", 0); - dfsan_set_label(i_label, &i, sizeof(i)); - - int j = 2; - dfsan_label j_label = dfsan_create_label("j", 0); - dfsan_set_label(j_label, &j, sizeof(j)); - - int k = 3; - dfsan_label k_label = dfsan_create_label("k", 0); - dfsan_set_label(k_label, &k, sizeof(k)); - - dfsan_label ij_label = dfsan_get_label(i + j); - assert(dfsan_has_label(ij_label, i_label)); - assert(dfsan_has_label(ij_label, j_label)); - assert(!dfsan_has_label(ij_label, k_label)); - - dfsan_label ijk_label = dfsan_get_label(i + j + k); - assert(dfsan_has_label(ijk_label, i_label)); - assert(dfsan_has_label(ijk_label, j_label)); - assert(dfsan_has_label(ijk_label, k_label)); - - return 0; - } - -fast16labels mode -================= - -If you need 16 or fewer labels, you can use fast16labels instrumentation for -less CPU and code size overhead. To use fast16labels instrumentation, you'll -need to specify `-fsanitize=dataflow -mllvm -dfsan-fast-16-labels` in your -compile and link commands and use a modified API for creating and managing -labels. - -In fast16labels mode, base labels are simply 16-bit unsigned integers that are -powers of 2 (i.e. 1, 2, 4, 8, ..., 32768), and union labels are created by ORing -base labels. 
In this mode DFSan does not manage any label metadata, so the -functions `dfsan_create_label`, `dfsan_union`, `dfsan_get_label_info`, -`dfsan_has_label`, `dfsan_has_label_with_desc`, `dfsan_get_label_count`, and -`dfsan_dump_labels` are unsupported. Instead of using them, the user should -maintain any necessary metadata about base labels themselves. - -For example: - .. code-block:: c++ #include <sanitizer/dfsan_interface.h> @@ -216,6 +171,11 @@ assert(!(ij_label & k_label)); // ij_label doesn't have k_label assert(ij_label == 3); // Verifies all of the above + // Or, equivalently: + assert(dfsan_has_label(ij_label, i_label)); + assert(dfsan_has_label(ij_label, j_label)); + assert(!dfsan_has_label(ij_label, k_label)); + dfsan_label ijk_label = dfsan_get_label(i + j + k); assert(ijk_label & i_label); // ijk_label has i_label @@ -223,6 +183,11 @@ assert(ijk_label & k_label); // ijk_label has k_label assert(ijk_label == 7); // Verifies all of the above + // Or, equivalently: + assert(dfsan_has_label(ijk_label, i_label)); + assert(dfsan_has_label(ijk_label, j_label)); + assert(dfsan_has_label(ijk_label, k_label)); + return 0; } diff --git a/clang/docs/DataFlowSanitizerDesign.rst b/clang/docs/DataFlowSanitizerDesign.rst --- a/clang/docs/DataFlowSanitizerDesign.rst +++ b/clang/docs/DataFlowSanitizerDesign.rst @@ -12,9 +12,7 @@ a number of taint labels with any data stored in any memory region accessible by the program. The analysis is dynamic, which means that it operates on a running program, and tracks how the labels propagate -through that program. The tool shall support a large (>100) number -of labels, such that programs which operate on large numbers of data -items may be analysed with each data item being tracked separately. +through that program. Use Cases --------- @@ -28,16 +26,13 @@ Interface --------- -A number of functions are provided which will create taint labels, -attach labels to memory regions and extract the set of labels -associated with a specific memory region. 
These functions are declared -in the header file ``sanitizer/dfsan_interface.h``. +A number of functions are provided which will attach taint labels to +memory regions and extract the set of labels associated with a +specific memory region. These functions are declared in the header +file ``sanitizer/dfsan_interface.h``. .. code-block:: c - /// Creates and returns a base label with the given description and user data. - dfsan_label dfsan_create_label(const char *desc, void *userdata); - /// Sets the label for each address in [addr,addr+size) to \c label. void dfsan_set_label(dfsan_label label, void *addr, size_t size); @@ -53,93 +48,57 @@ /// value. dfsan_label dfsan_get_label(long data); - /// Retrieves a pointer to the dfsan_label_info struct for the given label. - const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label); - /// Returns whether the given label label contains the label elem. int dfsan_has_label(dfsan_label label, dfsan_label elem); - /// If the given label label contains a label with the description desc, returns - /// that label, else returns 0. - dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc); + /// Computes the union of \c l1 and \c l2, resulting in a union label. + dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2); Taint label representation -------------------------- -As stated above, the tool must track a large number of taint -labels. This poses an implementation challenge, as most multiple-label -tainting systems assign one label per bit to shadow storage, and -union taint labels using a bitwise or operation. This will not scale -to clients which use hundreds or thousands of taint labels, as the -label union operation becomes O(n) in the number of supported labels, -and data associated with it will quickly dominate the live variable -set, causing register spills and hampering performance. - -Instead, a low overhead approach is proposed which is best-case O(log\ -:sub:`2` n) during execution. 
The underlying assumption is that -the required space of label unions is sparse, which is a reasonable -assumption to make given that we are optimizing for the case where -applications mostly copy data from one place to another, without often -invoking the need for an actual union operation. The representation -of a taint label is a 16-bit integer, and new labels are allocated -sequentially from a pool. The label identifier 0 is special, and means -that the data item is unlabelled. - -When a label union operation is requested at a join point (any -arithmetic or logical operation with two or more operands, such as -addition), the code checks whether a union is required, whether the -same union has been requested before, and whether one union label -subsumes the other. If so, it returns the previously allocated union -label. If not, it allocates a new union label from the same pool used -for new labels. - -Specifically, the instrumentation pass will insert code like this -to decide the union label ``lu`` for a pair of labels ``l1`` -and ``l2``: - -.. code-block:: c - - if (l1 == l2) - lu = l1; - else - lu = __dfsan_union(l1, l2); - -The equality comparison is outlined, to provide an early exit in -the common cases where the program is processing unlabelled data, or -where the two data items have the same label. ``__dfsan_union`` is -a runtime library function which performs all other union computation. +We use an 8-bit unsigned integer for the representation of a +label. The label identifier 0 is special, and means that the data item +is unlabelled. This is optimizing for low CPU and code size overhead +of the instrumentation. When a label union operation is requested at a +join point (any arithmetic or logical operation with two or more +operands, such as addition), we can simply OR the two labels in O(1). -Further optimizations are possible, for example if ``l1`` is known -at compile time to be zero (e.g. 
it is derived from a constant), -``l2`` can be used for ``lu``, and vice versa. +Users are responsible for managing the 8 integer labels (i.e., keeping +track of what labels they have used so far, picking one that is not yet +in use, etc.). Memory layout and label management ---------------------------------- -The following is the current memory layout for Linux/x86\_64: +The following is the memory layout for Linux/x86\_64: +---------------+---------------+--------------------+ | Start | End | Use | +===============+===============+====================+ | 0x700000008000|0x800000000000 | application memory | +---------------+---------------+--------------------+ -| 0x200200000000|0x700000008000 | unused | +| 0x300000000000|0x700000008000 | unused | ++---------------+---------------+--------------------+ +| 0x200000008000|0x300000000000 | origin | ++---------------+---------------+--------------------+ +| 0x200000000000|0x200000008000 | unused | +---------------+---------------+--------------------+ -| 0x200000000000|0x200200000000 | union table | +| 0x100000008000|0x200000000000 | shadow memory | +---------------+---------------+--------------------+ -| 0x000000010000|0x200000000000 | shadow memory | +| 0x000000010000|0x100000008000 | unused | +---------------+---------------+--------------------+ | 0x000000000000|0x000000010000 | reserved by kernel | +---------------+---------------+--------------------+ -Each byte of application memory corresponds to two bytes of shadow -memory, which are used to store its taint label. As for LLVM SSA +Each byte of application memory corresponds to a single byte of shadow +memory, which is used to store its taint label. As for LLVM SSA registers, we have not found it necessary to associate a label with each byte or bit of data, as some other tools do. Instead, labels are associated directly with registers. 
Loads will result in a union of -all shadow labels corresponding to bytes loaded (which most of the -time will be short circuited by the initial comparison) and stores will -result in a copy of the label to the shadow of all bytes stored to. +all shadow labels corresponding to bytes loaded, and stores will +result in a copy of the label of the stored value to the shadow of all +bytes stored to. Propagating labels through arguments ------------------------------------ diff --git a/compiler-rt/include/sanitizer/dfsan_interface.h b/compiler-rt/include/sanitizer/dfsan_interface.h --- a/compiler-rt/include/sanitizer/dfsan_interface.h +++ b/compiler-rt/include/sanitizer/dfsan_interface.h @@ -21,34 +21,15 @@ extern "C" { #endif -typedef uint16_t dfsan_label; +typedef uint8_t dfsan_label; typedef uint32_t dfsan_origin; -/// Stores information associated with a specific label identifier. A label -/// may be a base label created using dfsan_create_label, with associated -/// text description and user data, or an automatically created union label, -/// which represents the union of two label identifiers (which may themselves -/// be base or union labels). -struct dfsan_label_info { - // Fields for union labels, set to 0 for base labels. - dfsan_label l1; - dfsan_label l2; - - // Fields for base labels. - const char *desc; - void *userdata; -}; - /// Signature of the callback argument to dfsan_set_write_callback(). typedef void (*dfsan_write_callback_t)(int fd, const void *buf, size_t count); -/// Computes the union of \c l1 and \c l2, possibly creating a union label in -/// the process. +/// Computes the union of \c l1 and \c l2, resulting in a union label. dfsan_label dfsan_union(dfsan_label l1, dfsan_label l2); -/// Creates and returns a base label with the given description and user data. -dfsan_label dfsan_create_label(const char *desc, void *userdata); - /// Sets the label for each address in [addr,addr+size) to \c label. 
void dfsan_set_label(dfsan_label label, void *addr, size_t size); @@ -73,19 +54,9 @@ /// Retrieves the label associated with the data at the given address. dfsan_label dfsan_read_label(const void *addr, size_t size); -/// Retrieves a pointer to the dfsan_label_info struct for the given label. -const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label); - /// Returns whether the given label label contains the label elem. int dfsan_has_label(dfsan_label label, dfsan_label elem); -/// If the given label label contains a label with the description desc, returns -/// that label, else returns 0. -dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc); - -/// Returns the number of labels allocated. -size_t dfsan_get_label_count(void); - /// Flushes the DFSan shadow, i.e. forgets about all labels currently associated /// with the application memory. Use this call to start over the taint tracking /// within the same process. @@ -99,12 +70,6 @@ /// callback executes. Pass in NULL to remove any callback. void dfsan_set_write_callback(dfsan_write_callback_t labeled_write_callback); -/// Writes the labels currently used by the program to the given file -/// descriptor. The lines of the output have the following format: -/// -///