diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt --- a/libc/src/stdio/scanf_core/CMakeLists.txt +++ b/libc/src/stdio/scanf_core/CMakeLists.txt @@ -78,9 +78,10 @@ SRCS converter.cpp string_converter.cpp + int_converter.cpp HDRS converter.h - string_converter.h + int_converter.h DEPENDS .reader .core_structs diff --git a/libc/src/stdio/scanf_core/converter.cpp b/libc/src/stdio/scanf_core/converter.cpp --- a/libc/src/stdio/scanf_core/converter.cpp +++ b/libc/src/stdio/scanf_core/converter.cpp @@ -12,6 +12,7 @@ #include "src/stdio/scanf_core/core_structs.h" #include "src/stdio/scanf_core/reader.h" +#include "src/stdio/scanf_core/int_converter.h" #include "src/stdio/scanf_core/string_converter.h" #include @@ -32,16 +33,16 @@ case 'c': case '[': return convert_string(reader, to_conv); - // case 'd': - // case 'i': - // case 'u': - // case 'o': - // case 'x': - // case 'X': - // ret_val = raw_match(reader, " "); - // if (ret_val != READ_OK) - // return ret_val; - // return convert_int(reader, to_conv); + case 'd': + case 'i': + case 'u': + case 'o': + case 'x': + case 'X': + ret_val = raw_match(reader, " "); + if (ret_val != READ_OK) + return ret_val; + return convert_int(reader, to_conv); // #ifndef LLVM_LIBC_SCANF_DISABLE_FLOAT // case 'f': // case 'F': diff --git a/libc/src/stdio/scanf_core/int_converter.h b/libc/src/stdio/scanf_core/int_converter.h new file mode 100644 --- /dev/null +++ b/libc/src/stdio/scanf_core/int_converter.h @@ -0,0 +1,25 @@ +//===-- Int type specifier converter for scanf ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H +#define LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H + +#include "src/stdio/scanf_core/core_structs.h" +#include "src/stdio/scanf_core/reader.h" + +#include + +namespace __llvm_libc { +namespace scanf_core { + +int convert_int(Reader *reader, const FormatSection &to_conv); + +} // namespace scanf_core +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STDIO_SCANF_CORE_INT_CONVERTER_H diff --git a/libc/src/stdio/scanf_core/int_converter.cpp b/libc/src/stdio/scanf_core/int_converter.cpp new file mode 100644 --- /dev/null +++ b/libc/src/stdio/scanf_core/int_converter.cpp @@ -0,0 +1,266 @@ +//===-- Int type specifier converters for scanf -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdio/scanf_core/string_converter.h" + +#include "src/__support/CPP/limits.h" +#include "src/__support/ctype_utils.h" +#include "src/stdio/scanf_core/core_structs.h" +#include "src/stdio/scanf_core/reader.h" + +#include + +namespace __llvm_libc { +namespace scanf_core { + +constexpr char inline to_lower(char a) { return a | 32; } + +constexpr inline int b36_char_to_int(char input) { + if (internal::isdigit(input)) + return input - '0'; + if (internal::isalpha(input)) + return to_lower(input) + 10 - 'a'; + return 0; +} + +void write_with_length(uintmax_t output_val, const FormatSection &to_conv) { + if ((to_conv.flags & NO_WRITE) != 0) { + return; + } + LengthModifier lm = to_conv.length_modifier; + void *output_ptr = to_conv.output_ptr; + switch (lm) { + case (LengthModifier::hh): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::h): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::NONE): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::l): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::ll): + case (LengthModifier::L): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::j): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + case (LengthModifier::z): + *reinterpret_cast(output_ptr) = static_cast(output_val); + break; + case (LengthModifier::t): + *reinterpret_cast(output_ptr) = + static_cast(output_val); + break; + } +} + +// This code is very similar to the code in __support/str_to_integer.h but is +// not quite the same. Here is the list of differences and why they exist: +// 1) This takes a reader and a format section instead of a char* and the base. +// This should be fairly self explanatory. While the char* could be adapted +// to a reader and the base could be calculated ahead of time, the +// semantics are slightly different, specifically a char* can be indexed +// freely (I can read str[2] and then str[0]) whereas a File (which the +// reader may contain) cannot. +// 2) Because this uses a Reader, this function can only unget once. +// This is relevant because scanf specifies it reads the "longest sequence +// of input characters which does not exceed any specified field width and +// which is, or is a prefix of, a matching input sequence." Whereas the +// strtol function accepts "the longest initial subsequence of the input +// string (...) that is of the expected form." This is demonstrated by the +// differences in how they deal with the string "0xZZZ" when parsing as +// hexadecimal. Scanf will read the "0x" as a valid prefix and return 0, +// since it reads the first 'Z', sees that it's not a valid hex digit, and +// reverses one character. The strtol function on the other hand only +// accepts the "0" since that's the longest valid hexadecimal sequence. It +// sees the 'Z' after the "0x" and determines that this is not the prefix +// to a valid hex string. +// 3) This conversion may have a maximum width. +// If a maximum width is specified, this conversion is only allowed to +// accept a certain number of characters. Strtol doesn't have any such +// limitation. +int convert_int(Reader *reader, const FormatSection &to_conv) { + // %d "Matches an optionally signed decimal integer [...] with the value 10 + // for the base argument. The corresponding argument shall be a pointer to + // signed integer." + + // %i "Matches an optionally signed integer [...] with the value 0 for the + // base argument. The corresponding argument shall be a pointer to signed + // integer." + + // %u "Matches an optionally signed decimal integer [...] with the value 10 + // for the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + // %o "Matches an optionally signed octal integer [...] with the value 8 for + // the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + // %x/X "Matches an optionally signed hexadecimal integer [...] with the value + // 16 for the base argument. The corresponding argument shall be a pointer to + // unsigned integer" + + size_t max_width = cpp::numeric_limits::max(); + if (to_conv.max_width > 0) { + max_width = to_conv.max_width; + } + + uintmax_t result = 0; + bool is_number = false; + bool is_signed = false; + int base = 0; + if (to_conv.conv_name == 'i') { + base = 0; + is_signed = true; + } else if (to_conv.conv_name == 'o') { + base = 8; + } else if (to_lower(to_conv.conv_name) == 'x') { + base = 16; + } else if (to_conv.conv_name == 'd') { + base = 10; + is_signed = true; + } else { // conv_name must be 'u' + base = 10; + } + + char cur_char = reader->getc(); + + char result_sign = '+'; + if (cur_char == '+' || cur_char == '-') { + result_sign = cur_char; + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + // If the max width has been hit already, then the return value must be 0 + // since no actual digits of the number have been parsed yet. + write_with_length(0, to_conv); + return MATCHING_FAILURE; + } + } + const bool is_negative = result_sign == '-'; + + // Base of 0 means automatically determine the base. Base of 16 may have a + // prefix of "0x" + if (base == 0 || base == 16) { + // If the first character is 0, then it could be octal or hex. + if (cur_char == '0') { + is_number = true; + + // Read the next character to check. + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + write_with_length(0, to_conv); + return READ_OK; + } + + if (to_lower(cur_char) == 'x') { + // This is a valid hex prefix. + base = 16; + if (max_width > 1) { + --max_width; + cur_char = reader->getc(); + } else { + write_with_length(0, to_conv); + return READ_OK; + } + + } else { + if (base == 0) { + base = 8; + } + } + } else if (base == 0) { + if (internal::isdigit(cur_char)) { + // If the first character is a different number, then it's 10. + base = 10; + } else { + // If the first character isn't a valid digit, then there are no valid + // digits at all. The number is 0. + reader->ungetc(cur_char); + write_with_length(0, to_conv); + return MATCHING_FAILURE; + } + } + } + + constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits::max(); + constexpr uintmax_t SIGNED_MAX = + static_cast(cpp::numeric_limits::max()); + constexpr uintmax_t NEGATIVE_SIGNED_MAX = + static_cast(cpp::numeric_limits::max()) + 1; + + const uintmax_t MAX = + (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX) + : UNSIGNED_MAX); + + const uintmax_t max_div_by_base = MAX / base; + + if (internal::isalnum(cur_char) && b36_char_to_int(cur_char) < base) { + is_number = true; + } + + bool has_overflow = false; + size_t i = 0; + for (; i < max_width && internal::isalnum(cur_char) && + b36_char_to_int(cur_char) < base; + ++i, cur_char = reader->getc()) { + + uintmax_t cur_digit = b36_char_to_int(cur_char); + + if (result == MAX) { + has_overflow = true; + continue; + } else if (result > max_div_by_base) { + result = MAX; + has_overflow = true; + } else { + result = result * base; + } + + if (result > MAX - cur_digit) { + result = MAX; + has_overflow = true; + } else { + result = result + cur_digit; + } + } + + // We always read one more character than will be used, so we have to put the + // last one back. + reader->ungetc(cur_char); + + if (has_overflow) { + write_with_length(MAX, to_conv); + } else { + if (is_negative) + result = -result; + + write_with_length(result, to_conv); + } + + if (!is_number) + return MATCHING_FAILURE; + return READ_OK; +} + +} // namespace scanf_core +} // namespace __llvm_libc diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp --- a/libc/test/src/stdio/sscanf_test.cpp +++ b/libc/test/src/stdio/sscanf_test.cpp @@ -6,8 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/limits.h" #include "src/stdio/sscanf.h" +#include // For EOF + #include "utils/UnitTest/Test.h" TEST(LlvmLibcSScanfTest, SimpleStringConv) { @@ -28,3 +31,200 @@ ASSERT_STREQ(buffer, "abc"); ASSERT_STREQ(buffer2, "123"); } + +TEST(LlvmLibcSScanfTest, IntConvSimple) { + int ret_val; + int result = 0; + ret_val = __llvm_libc::sscanf("123", "%d", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 123); + + ret_val = __llvm_libc::sscanf("456", "%i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 456); + + ret_val = __llvm_libc::sscanf("789", "%x", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0x789); + + ret_val = __llvm_libc::sscanf("012", "%o", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 012); + + ret_val = __llvm_libc::sscanf("345", "%u", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 345); + + ret_val = __llvm_libc::sscanf("Not an integer", "%d", &result); + EXPECT_EQ(ret_val, 0); +} + +TEST(LlvmLibcSScanfTest, IntConvLengthModifier) { + int ret_val; + uintmax_t max_result = 0; + int int_result = 0; + char char_result = 0; + + ret_val = __llvm_libc::sscanf("123", "%ju", &max_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(max_result, uintmax_t(123)); + + // Check overflow handling + ret_val = __llvm_libc::sscanf("999999999999999999999999999999999999", "%ju", + &max_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits::max()); + + // Because this is unsigned, any out of range value should return the maximum, + // even with a negative sign. + ret_val = __llvm_libc::sscanf("-999999999999999999999999999999999999", "%ju", + &max_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits::max()); + + ret_val = __llvm_libc::sscanf("-18446744073709551616", "%ju", &max_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(max_result, __llvm_libc::cpp::numeric_limits::max()); + + // But any number below the maximum should have the - sign applied. + ret_val = __llvm_libc::sscanf("-1", "%ju", &max_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(max_result, uintmax_t(-1)); + + ret_val = __llvm_libc::sscanf("-1", "%u", &int_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(int_result, -1); + + max_result = 0xff00ff00ff00ff00; + char_result = 0x6f; + + // Overflows for sizes larger than the maximum are handled by casting. + ret_val = __llvm_libc::sscanf("8589967360", "%d", &int_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(int_result, int(8589967360)); // 2^33 + 2^15 + + // Check that the adjacent values weren't touched by the overflow. + ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00)); + ASSERT_EQ(char_result, char(0x6f)); + + ret_val = __llvm_libc::sscanf("-8589967360", "%d", &int_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(int_result, int(-8589967360)); + ASSERT_EQ(max_result, uintmax_t(0xff00ff00ff00ff00)); + ASSERT_EQ(char_result, char(0x6f)); + + ret_val = __llvm_libc::sscanf("25", "%hhd", &char_result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(char_result, char(25)); +} + +TEST(LlvmLibcSScanfTest, IntConvBaseSelection) { + int ret_val; + int result = 0; + ret_val = __llvm_libc::sscanf("0xabc123", "%i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0xabc123); + + ret_val = __llvm_libc::sscanf("0456", "%i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0456); + + ret_val = __llvm_libc::sscanf("0999", "%i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("123abc456", "%i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 123); +} + +TEST(LlvmLibcSScanfTest, IntConvMaxLengthTests) { + int ret_val; + int result = 0; + + ret_val = __llvm_libc::sscanf("12", "%1d", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 1); + + ret_val = __llvm_libc::sscanf("-1", "%1d", &result); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("+1", "%1d", &result); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("01", "%1d", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("01", "%1i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("0x1", "%2i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("-0x1", "%3i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("-0x123", "%4i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, -1); + + ret_val = __llvm_libc::sscanf("123456789", "%5i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 12345); + + ret_val = __llvm_libc::sscanf("123456789", "%10i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 123456789); +} + +TEST(LlvmLibcSScanfTest, IntConvNoWriteTests) { + int ret_val; + // Result shouldn't be used by these tests, but it's safer to have it and + // check it. + int result = 0; + ret_val = __llvm_libc::sscanf("-1", "%*1d", &result); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("01", "%*1i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("0x1", "%*2i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("a", "%*i", &result); + EXPECT_EQ(ret_val, 0); + EXPECT_EQ(result, 0); + + ret_val = __llvm_libc::sscanf("123", "%*i", &result); + EXPECT_EQ(ret_val, 1); + EXPECT_EQ(result, 0); +} + +TEST(LlvmLibcSScanfTest, CombinedConv) { + int ret_val; + int result = 0; + char buffer[10]; + ret_val = __llvm_libc::sscanf("123abc", "%i%s", &result, buffer); + EXPECT_EQ(ret_val, 2); + EXPECT_EQ(result, 123); + ASSERT_STREQ(buffer, "abc"); + + ret_val = __llvm_libc::sscanf("0xZZZ", "%i%s", &result, buffer); + EXPECT_EQ(ret_val, 2); + EXPECT_EQ(result, 0); + ASSERT_STREQ(buffer, "ZZZ"); + + ret_val = __llvm_libc::sscanf("0xZZZ", "%X%s", &result, buffer); + EXPECT_EQ(ret_val, 2); + EXPECT_EQ(result, 0); + ASSERT_STREQ(buffer, "ZZZ"); +}