diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -8,3 +8,11 @@
     libc.src.threads.mtx_lock
     libc.src.threads.mtx_unlock
 )
+
+
+add_header_library(
+  printf_parser
+  HDRS
+    printf_parser.h
+    printf_format_struct.h
+)
diff --git a/libc/src/stdio/printf_format_struct.h b/libc/src/stdio/printf_format_struct.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdio/printf_format_struct.h
@@ -0,0 +1,62 @@
+//===-- Definition of the common format struct ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_FORMAT_STRUCT_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_FORMAT_STRUCT_H
+
+#include <stddef.h>
+
+namespace __llvm_libc {
+namespace internal {
+
+// The length modifiers a conversion can carry; none means there isn't one.
+enum class LengthModifier { hh, h, l, ll, j, z, t, L, none };
+
+// Everything parsed out of a single conversion specifier.
+struct PrintfFormat {
+  bool left_justified = false;
+  bool force_sign = false;
+  bool space_prefix = false;
+  bool alt_form = false;
+  bool leading_zeroes = false;
+
+  // Defaults to none so that a specifier without a length modifier never
+  // leaves this field indeterminate.
+  LengthModifier length_modifier = LengthModifier::none;
+
+  // a 0 index means "next sequentially", anything else is an absolute index.
+  size_t width_index = 0;
+  size_t precision_index = 0;
+  size_t conv_index = 0;
+
+  // This determines if width and precision are actually variables which can be
+  // found at the indexes width_index and precision_index respectively, or if
+  // they are constants, found in min_width and precision respectively.
+  bool width_is_var = false;
+  bool precision_is_var = false;
+  // conv is always assumed to be a variable, it's the data being converted.
+
+  int min_width = 0;
+  int precision = -1;
+  char conv_name = '\0';
+};
+
+// One piece of a format string: either raw text or a conversion specifier.
+struct PrintfToken {
+  PrintfFormat format;
+  // The token's text: raw_len characters starting at raw_string.
+  const char *__restrict raw_string = nullptr;
+  size_t raw_len = 0;
+
+  // True for a conversion specifier, false for raw text.
+  bool has_conv = false;
+};
+
+} // namespace internal
+} // namespace __llvm_libc
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_FORMAT_STRUCT_H
diff --git a/libc/src/stdio/printf_parser.h b/libc/src/stdio/printf_parser.h
new file mode 100644
--- /dev/null
+++ b/libc/src/stdio/printf_parser.h
@@ -0,0 +1,353 @@
+//===-- Definition of the parser for printf ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_PRINTF_PARSER_H
+#define LLVM_LIBC_SRC_STDIO_PRINTF_PARSER_H
+
+#include "src/__support/ctype_utils.h"
+#include "src/__support/str_to_integer.h"
+#include "src/stdio/printf_format_struct.h"
+#include "src/string/memory_utils/memcpy_implementations.h"
+#include "src/string/memory_utils/memset_implementations.h"
+#include "src/string/string_utils.h"
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h> // for malloc
+
+namespace __llvm_libc {
+namespace internal {
+
+// Splits a printf format string into tokens and hands out its arguments.
+class PrintfParser {
+  const char *__restrict str;
+  size_t cur_pos = 0;
+  va_list *vlist;
+
+  // TODO(michaelrj): implement a resizable vector class, then convert
+  // token_array and arg_sizes into vectors.
+  size_t token_array_size = 20;
+  size_t token_array_write = 0;
+  size_t token_array_read = 0;
+  PrintfToken *token_array = nullptr;
+
+  // Appends new_token to token_array, adding 5 slots first if it is full.
+  void write_token(PrintfToken new_token) {
+    if (token_array_write >= token_array_size) {
+      token_array_size = token_array_size + 5;
+      token_array = static_cast<PrintfToken *>(
+          realloc(token_array, token_array_size * sizeof(PrintfToken)));
+    }
+    token_array[token_array_write] = new_token;
+    ++token_array_write;
+  }
+
+  // arg_sizes[i] records whether argument i (1-based) is read as 64 bits.
+  size_t arg_sizes_len = 0;
+  bool *arg_sizes = nullptr;
+  void write_arg_size(size_t index, bool is_64_bits) {
+    if (index == 0)
+      return;
+    if (index >= arg_sizes_len) {
+      // realloc(nullptr, n) allocates, so this also handles the first call.
+      const size_t new_len = index + 5;
+      arg_sizes =
+          static_cast<bool *>(realloc(arg_sizes, new_len * sizeof(bool)));
+      // realloc leaves the added slots indeterminate, so clear them.
+      for (size_t i = arg_sizes_len; i < new_len; ++i)
+        arg_sizes[i] = false;
+      arg_sizes_len = new_len;
+    }
+    arg_sizes[index] = is_64_bits;
+  }
+
+  // Parses an "N$" argument index at str[cur_pos], the character right after
+  // a '%' or '*'. Returns N and moves cur_pos past the '$'; if there is no
+  // number or no '$' after it, returns 0 and leaves cur_pos where it was.
+  size_t parse_index() {
+    if (!isdigit(str[cur_pos]))
+      return 0;
+    char *int_end;
+    size_t index = strtointeger<size_t>(str + cur_pos, &int_end, 10);
+    if (int_end[0] != '$')
+      return 0;
+    cur_pos = 1 + int_end - str;
+    return index;
+  }
+
+  // Assumes str[cur_pos] is inside a format specifier. Parses any flags found
+  // there, advancing cur_pos past them and recording them in the PrintfFormat
+  // passed in.
+  void parse_flags(PrintfFormat &cur_format) {
+    for (;; ++cur_pos) {
+      switch (str[cur_pos]) {
+      case '-':
+        cur_format.left_justified = true;
+        break;
+      case '+':
+        cur_format.force_sign = true;
+        break;
+      case ' ':
+        cur_format.space_prefix = true;
+        break;
+      case '#':
+        cur_format.alt_form = true;
+        break;
+      case '0':
+        cur_format.leading_zeroes = true;
+        break;
+      default:
+        return; // not a flag character, so the flag section is over.
+      }
+    }
+  }
+
+  // Assumes str[cur_pos] is inside a format specifier. Parses the length
+  // modifier there, if any, advancing cur_pos; records none if there is none.
+  void parse_length_modifier(PrintfFormat &cur_format) {
+    cur_format.length_modifier = LengthModifier::none;
+    switch (str[cur_pos]) {
+    case ('l'):
+      if (str[cur_pos + 1] == 'l') {
+        cur_format.length_modifier = LengthModifier::ll;
+        cur_pos += 2;
+      } else {
+        cur_format.length_modifier = LengthModifier::l;
+        ++cur_pos;
+      }
+      break;
+    case ('h'):
+      if (str[cur_pos + 1] == 'h') {
+        cur_format.length_modifier = LengthModifier::hh;
+        cur_pos += 2;
+      } else {
+        cur_format.length_modifier = LengthModifier::h;
+        ++cur_pos;
+      }
+      break;
+    case ('L'):
+      cur_format.length_modifier = LengthModifier::L;
+      ++cur_pos;
+      break;
+    case ('j'):
+      cur_format.length_modifier = LengthModifier::j;
+      ++cur_pos;
+      break;
+    case ('z'):
+      cur_format.length_modifier = LengthModifier::z;
+      ++cur_pos;
+      break;
+    case ('t'):
+      cur_format.length_modifier = LengthModifier::t;
+      ++cur_pos;
+      break;
+    }
+  }
+
+  // Returns true if get_arg_index should step over the argument of conversion
+  // conv_name (with the given length modifier) as 64 bits rather than 32.
+  static bool is_64_bits(char conv_name, LengthModifier length_modifier) {
+    switch (conv_name) {
+    // integers
+    case ('d'):
+    case ('i'):
+    case ('o'):
+    case ('u'):
+    case ('x'):
+    case ('X'):
+      return !(length_modifier == LengthModifier::h ||
+               length_modifier == LengthModifier::hh ||
+               length_modifier == LengthModifier::none);
+
+    // floats
+    case ('f'):
+    case ('F'):
+    case ('e'):
+    case ('E'):
+    case ('g'):
+    case ('G'):
+    case ('a'):
+    case ('A'):
+    // string
+    case ('s'):
+    // pointer
+    case ('p'):
+    // int pointer
+    case ('n'):
+      return true;
+
+    // char, and anything unrecognized
+    case ('c'):
+    default:
+      return false;
+    }
+  }
+
+  template <typename T> T get_arg_next() { return va_arg(*vlist, T); }
+
+  // Reads the argument at the given 1-based index from a copy of the va_list,
+  // so the caller's va_list is not advanced. Uses arg_sizes to step over.
+  template <typename T> T get_arg_index(size_t index) {
+    va_list vlist_copy;
+    va_copy(vlist_copy, *vlist);
+
+    // i starts at 1 because the args are indexed from 1
+    for (size_t i = 1; i < index; ++i) {
+      // An index past the end of arg_sizes was never recorded: treat as 32.
+      if (i < arg_sizes_len && arg_sizes[i])
+        va_arg(vlist_copy, uint64_t);
+      else
+        va_arg(vlist_copy, uint32_t);
+    }
+    T arg = va_arg(vlist_copy, T);
+    va_end(vlist_copy);
+    return arg;
+  }
+
+  // Splits str into tokens and stores them with write_token. Each '%' starts a
+  // conversion token (has_conv is true); the text in between is raw tokens.
+  void parse_str() {
+    size_t prev_token_end = 0;
+
+    while (str[cur_pos]) {
+      if (str[cur_pos] != '%') {
+        ++cur_pos;
+        continue;
+      }
+
+      if (cur_pos > prev_token_end) {
+        PrintfToken raw_token;
+        raw_token.has_conv = false;
+        raw_token.raw_string = str + prev_token_end;
+        raw_token.raw_len = cur_pos - prev_token_end;
+        write_token(raw_token);
+      }
+      prev_token_end = cur_pos;
+
+      PrintfToken new_token;
+      new_token.raw_string = str + cur_pos;
+      new_token.has_conv = true;
+
+      ++cur_pos; // advance past the % sign
+
+      new_token.format.conv_index = parse_index();
+      parse_flags(new_token.format);
+
+      // handle width
+      if (str[cur_pos] == '*') {
+        ++cur_pos;
+        new_token.format.width_is_var = true;
+        size_t temp_index = parse_index();
+        new_token.format.width_index = temp_index;
+        write_arg_size(temp_index, false); // width is of type int.
+      } else if (isdigit(str[cur_pos])) {
+        char *int_end;
+        new_token.format.width_is_var = false;
+        new_token.format.min_width =
+            strtointeger<int>(str + cur_pos, &int_end, 10);
+        cur_pos = int_end - str;
+      }
+
+      // handle precision
+      if (str[cur_pos] == '.') {
+        ++cur_pos; // advance past the '.'
+
+        // if there's just a . with no number, it's assumed that the
+        // precision is set to 0.
+        new_token.format.precision = 0;
+
+        if (str[cur_pos] == '*') {
+          ++cur_pos;
+          new_token.format.precision_is_var = true;
+          size_t temp_index = parse_index();
+          new_token.format.precision_index = temp_index;
+          write_arg_size(temp_index, false); // precision is of type int.
+        } else if (isdigit(str[cur_pos])) {
+          char *int_end;
+          new_token.format.precision_is_var = false;
+          new_token.format.precision =
+              strtointeger<int>(str + cur_pos, &int_end, 10);
+          cur_pos = int_end - str;
+        }
+      }
+
+      parse_length_modifier(new_token.format);
+
+      new_token.format.conv_name = str[cur_pos];
+      // Consume the conv_name, but never step past the string's terminator.
+      if (str[cur_pos] != '\0')
+        ++cur_pos;
+
+      if (new_token.format.conv_index != 0) {
+        // Use the saved conv_name: str[cur_pos] is now the character after it.
+        write_arg_size(new_token.format.conv_index,
+                       is_64_bits(new_token.format.conv_name,
+                                  new_token.format.length_modifier));
+      }
+
+      new_token.raw_len = cur_pos - prev_token_end;
+      write_token(new_token);
+      prev_token_end = cur_pos;
+    }
+
+    if (prev_token_end < cur_pos) {
+      PrintfToken final_token;
+      final_token.has_conv = false;
+      final_token.raw_string = str + prev_token_end;
+      final_token.raw_len = cur_pos - prev_token_end;
+      write_token(final_token);
+    }
+  }
+
+public:
+  // Parses input_str right away. in_vlist is only stored; get_arg reads it.
+  PrintfParser(const char *__restrict input_str, va_list *in_vlist)
+      : str{input_str}, vlist{in_vlist} {
+    token_array = static_cast<PrintfToken *>(
+        malloc(token_array_size * sizeof(PrintfToken)));
+    parse_str();
+  }
+
+  // Copying would make two parsers free the same buffers.
+  PrintfParser(const PrintfParser &) = delete;
+  PrintfParser &operator=(const PrintfParser &) = delete;
+
+  ~PrintfParser() {
+    free(token_array);
+    if (arg_sizes_len > 0)
+      free(arg_sizes);
+  }
+
+  // Returns the stored tokens in order, then an end marker once they run out.
+  PrintfToken get_next_token() {
+    PrintfToken cur_token;
+    if (token_array_read < token_array_write) {
+      cur_token = token_array[token_array_read];
+      ++token_array_read;
+    } else {
+      // a 0 raw_len represents the last token;
+      cur_token.raw_string = nullptr;
+      cur_token.raw_len = 0;
+      cur_token.has_conv = false;
+    }
+    return cur_token;
+  }
+
+  // Returns the argument at index, where 0 means "the next one in order" and
+  // anything else is an absolute, 1-based position.
+  template <typename T> T get_arg(size_t index) {
+    if (index == 0)
+      return get_arg_next<T>();
+    return get_arg_index<T>(index);
+  }
+};
+
+} // namespace internal
+} // namespace __llvm_libc
+#endif // LLVM_LIBC_SRC_STDIO_PRINTF_PARSER_H
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -9,3 +9,13 @@
   DEPENDS
     libc.src.stdio.fwrite
 )
+
+add_libc_unittest(
+  printf_parser_test
+  SUITE
+    libc_stdio_unittests
+  SRCS
+    printf_parser_test.cpp
+  DEPENDS
+    libc.src.stdio.printf_parser
+)
diff --git a/libc/test/src/stdio/printf_parser_test.cpp b/libc/test/src/stdio/printf_parser_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/stdio/printf_parser_test.cpp
@@ -0,0 +1,78 @@
+//===-- Unittests for printf_parser ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/printf_parser.h"
+
+#include "utils/UnitTest/Test.h"
+
+TEST(LlvmLibcPrintfParserTest, InitTest) {
+  const char *test_str = "abcDEF123";
+  va_list empty;
+  __llvm_libc::internal::PrintfParser(test_str, &empty);
+}
+
+TEST(LlvmLibcPrintfParserTest, RawTokenTest) {
+  const char *test_str = "abcDEF123";
+  va_list empty;
+  __llvm_libc::internal::PrintfParser parser(test_str, &empty);
+  __llvm_libc::internal::PrintfToken token = parser.get_next_token();
+
+  EXPECT_FALSE(token.has_conv);
+  EXPECT_EQ(token.raw_len, size_t(9));
+  ASSERT_STREQ(token.raw_string, test_str);
+}
+
+TEST(LlvmLibcPrintfParserTest, BasicTokens) {
+  const char *test_str = "abc %s DEF";
+  constexpr size_t array_size = 10;
+  __llvm_libc::internal::PrintfToken token_array[array_size];
+  size_t cur_token;
+  va_list empty;
+  __llvm_libc::internal::PrintfParser parser(test_str, &empty);
+  for (cur_token = 0; cur_token < array_size; ++cur_token) {
+    __llvm_libc::internal::PrintfToken new_token = parser.get_next_token();
+    if (new_token.raw_len) {
+      token_array[cur_token] = new_token;
+    } else {
+      break;
+    }
+  }
+
+  ASSERT_EQ(cur_token, size_t(3));
+
+  ASSERT_FALSE(token_array[0].has_conv);
+  ASSERT_EQ(token_array[0].raw_len, size_t(4));
+  // This is intentionally a pointer comparison, and not a string comparison.
+  ASSERT_EQ(token_array[0].raw_string, test_str);
+
+  ASSERT_TRUE(token_array[1].has_conv);
+
+  ASSERT_EQ(token_array[1].format.conv_name, 's');
+
+  ASSERT_EQ(token_array[1].raw_len, size_t(2));
+  // This is intentionally a pointer comparison, and not a string comparison.
+  ASSERT_EQ(token_array[1].raw_string, test_str + 4);
+
+  ASSERT_FALSE(token_array[2].has_conv);
+  ASSERT_EQ(token_array[2].raw_len, size_t(4));
+  // This is intentionally a pointer comparison, and not a string comparison.
+  ASSERT_EQ(token_array[2].raw_string, test_str + 6);
+}
+
+// The precision is the number that follows the '.', not the '.' itself.
+TEST(LlvmLibcPrintfParserTest, PrecisionToken) {
+  const char *test_str = "%.5f";
+  va_list empty;
+  __llvm_libc::internal::PrintfParser parser(test_str, &empty);
+  __llvm_libc::internal::PrintfToken token = parser.get_next_token();
+
+  ASSERT_TRUE(token.has_conv);
+  ASSERT_EQ(token.format.precision, 5);
+  ASSERT_EQ(token.format.conv_name, 'f');
+  ASSERT_EQ(token.raw_len, size_t(4));
+}