libc/src/stdio/scanf_core/int_converter.cpp

   1 //===-- Int type specifier converters for scanf -----------------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #include "src/stdio/scanf_core/int_converter.h"
  10
  11 #include "src/__support/CPP/limits.h"
  12 #include "src/__support/ctype_utils.h"
  13 #include "src/__support/macros/config.h"
  14 #include "src/stdio/scanf_core/converter_utils.h"
  15 #include "src/stdio/scanf_core/core_structs.h"
  16 #include "src/stdio/scanf_core/reader.h"
  17
  18 #include <stddef.h>
  19
  20 namespace LIBC_NAMESPACE_DECL {
  21 namespace scanf_core {
  22
  23 // This code is very similar to the code in __support/str_to_integer.h but is
  24 // not quite the same. Here is the list of differences and why they exist:
  25 //  1) This takes a reader and a format section instead of a char* and the base.
  26 //      This should be fairly self explanatory. While the char* could be adapted
  27 //      to a reader and the base could be calculated ahead of time, the
  28 //      semantics are slightly different, specifically a char* can be indexed
  29 //      freely (I can read str[2] and then str[0]) whereas a File (which the
  30 //      reader may contain) cannot.
  31 //  2) Because this uses a Reader, this function can only unget once.
  32 //      This is relevant because scanf specifies it reads the "longest sequence
  33 //      of input characters which does not exceed any specified field width and
  34 //      which is, or is a prefix of, a matching input sequence." Whereas the
  35 //      strtol function accepts "the longest initial subsequence of the input
  36 //      string (...) that is of the expected form." This is demonstrated by the
  37 //      differences in how they deal with the string "0xZZZ" when parsing as
  38 //      hexadecimal. Scanf will read the "0x" as a valid prefix and return 0,
  39 //      since it reads the first 'Z', sees that it's not a valid hex digit, and
  40 //      reverses one character. The strtol function on the other hand only
  41 //      accepts the "0" since that's the longest valid hexadecimal sequence. It
  42 //      sees the 'Z' after the "0x" and determines that this is not the prefix
  43 //      to a valid hex string.
  44 //  3) This conversion may have a maximum width.
  45 //      If a maximum width is specified, this conversion is only allowed to
  46 //      accept a certain number of characters. Strtol doesn't have any such
  47 //      limitation.
  48 int convert_int(Reader *reader, const FormatSection &to_conv) {
  49   // %d "Matches an optionally signed decimal integer [...] with the value 10
  50   // for the base argument. The corresponding argument shall be a pointer to
  51   // signed integer."
  52
  53   // %i "Matches an optionally signed integer [...] with the value 0 for the
  54   // base argument. The corresponding argument shall be a pointer to signed
  55   // integer."
  56
  57   // %u "Matches an optionally signed decimal integer [...] with the value 10
  58   // for the base argument. The corresponding argument shall be a pointer to
  59   // unsigned integer"
  60
  61   // %o "Matches an optionally signed octal integer [...] with the value 8 for
  62   // the base argument. The corresponding argument shall be a pointer to
  63   // unsigned integer"
  64
  65   // %x/X "Matches an optionally signed hexadecimal integer [...] with the value
  66   // 16 for the base argument. The corresponding argument shall be a pointer to
  67   // unsigned integer"
  68
  69   size_t max_width = cpp::numeric_limits<size_t>::max();
  70   if (to_conv.max_width > 0) {
  71     max_width = to_conv.max_width;
  72   }
  73
  74   uintmax_t result = 0;
  75   bool is_number = false;
  76   bool is_signed = false;
  77   int base = 0;
  78   if (to_conv.conv_name == 'i') {
  79     base = 0;
  80     is_signed = true;
  81   } else if (to_conv.conv_name == 'o') {
  82     base = 8;
  83   } else if (internal::tolower(to_conv.conv_name) == 'x' ||
  84              to_conv.conv_name == 'p') {
  85     base = 16;
  86   } else if (to_conv.conv_name == 'd') {
  87     base = 10;
  88     is_signed = true;
  89   } else { // conv_name must be 'u'
  90     base = 10;
  91   }
  92
  93   char cur_char = reader->getc();
  94
  95   char result_sign = '+';
  96   if (cur_char == '+' || cur_char == '-') {
  97     result_sign = cur_char;
  98     if (max_width > 1) {
  99       --max_width;
 100       cur_char = reader->getc();
 101     } else {
 102       // If the max width has been hit already, then the return value must be 0
 103       // since no actual digits of the number have been parsed yet.
 104       write_int_with_length(0, to_conv);
 105       return MATCHING_FAILURE;
 106     }
 107   }
 108   const bool is_negative = result_sign == '-';
 109
 110   // Base of 0 means automatically determine the base. Base of 16 may have a
 111   // prefix of "0x"
 112   if (base == 0 || base == 16) {
 113     // If the first character is 0, then it could be octal or hex.
 114     if (cur_char == '0') {
 115       is_number = true;
 116
 117       // Read the next character to check.
 118       if (max_width > 1) {
 119         --max_width;
 120         cur_char = reader->getc();
 121       } else {
 122         write_int_with_length(0, to_conv);
 123         return READ_OK;
 124       }
 125
 126       if (internal::tolower(cur_char) == 'x') {
 127         // This is a valid hex prefix.
 128
 129         is_number = false;
 130         // A valid hex prefix is not necessarily a valid number. For the
 131         // conversion to be valid it needs to use all of the characters it
 132         // consumes. From the standard:
 133         // 7.23.6.2 paragraph 9: "An input item is defined as the longest
 134         // sequence of input characters which does not exceed any specified
 135         // field width and which is, or is a prefix of, a matching input
 136         // sequence."
 137         // 7.23.6.2 paragraph 10: "If the input item is not a matching sequence,
 138         // the execution of the directive fails: this condition is a matching
 139         // failure"
 140         base = 16;
 141         if (max_width > 1) {
 142           --max_width;
 143           cur_char = reader->getc();
 144         } else {
 145           return MATCHING_FAILURE;
 146         }
 147
 148       } else {
 149         if (base == 0) {
 150           base = 8;
 151         }
 152       }
 153     } else if (base == 0) {
 154       if (internal::isdigit(cur_char)) {
 155         // If the first character is a different number, then it's 10.
 156         base = 10;
 157       } else {
 158         // If the first character isn't a valid digit, then there are no valid
 159         // digits at all. The number is 0.
 160         reader->ungetc(cur_char);
 161         write_int_with_length(0, to_conv);
 162         return MATCHING_FAILURE;
 163       }
 164     }
 165   }
 166
 167   constexpr uintmax_t UNSIGNED_MAX = cpp::numeric_limits<uintmax_t>::max();
 168   constexpr uintmax_t SIGNED_MAX =
 169       static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max());
 170   constexpr uintmax_t NEGATIVE_SIGNED_MAX =
 171       static_cast<uintmax_t>(cpp::numeric_limits<intmax_t>::max()) + 1;
 172
 173   const uintmax_t MAX =
 174       (is_signed ? (is_negative ? NEGATIVE_SIGNED_MAX : SIGNED_MAX)
 175                  : UNSIGNED_MAX);
 176
 177   const uintmax_t max_div_by_base = MAX / base;
 178
 179   if (internal::isalnum(cur_char) &&
 180       internal::b36_char_to_int(cur_char) < base) {
 181     is_number = true;
 182   }
 183
 184   bool has_overflow = false;
 185   size_t i = 0;
 186   for (; i < max_width && internal::isalnum(cur_char) &&
 187          internal::b36_char_to_int(cur_char) < base;
 188        ++i, cur_char = reader->getc()) {
 189
 190     uintmax_t cur_digit = internal::b36_char_to_int(cur_char);
 191
 192     if (result == MAX) {
 193       has_overflow = true;
 194       continue;
 195     } else if (result > max_div_by_base) {
 196       result = MAX;
 197       has_overflow = true;
 198     } else {
 199       result = result * base;
 200     }
 201
 202     if (result > MAX - cur_digit) {
 203       result = MAX;
 204       has_overflow = true;
 205     } else {
 206       result = result + cur_digit;
 207     }
 208   }
 209
 210   // We always read one more character than will be used, so we have to put the
 211   // last one back.
 212   reader->ungetc(cur_char);
 213
 214   if (!is_number)
 215     return MATCHING_FAILURE;
 216
 217   if (has_overflow) {
 218     write_int_with_length(MAX, to_conv);
 219   } else {
 220     if (is_negative)
 221       result = -result;
 222
 223     write_int_with_length(result, to_conv);
 224   }
 225
 226   return READ_OK;
 227 }
 228
 229 } // namespace scanf_core
 230 } // namespace LIBC_NAMESPACE_DECL