content/common/android/address_parser.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "content/common/android/address_parser.h"
   6
   7 #include "base/logging.h"
   8 #include "base/strings/string_util.h"
   9 #include "content/common/android/address_parser_internal.h"
  10
  11 namespace {
  12
  13 // Minimum number of words in an address after the house number
  14 // before a state is expected to be found.
  15 // A value too high can miss short addresses.
  16 const size_t kMinAddressWords = 3;
  17
  18 // Maximum number of words allowed in an address between the house number
  19 // and the state, both not included.
  20 const size_t kMaxAddressWords = 12;
  21
  22 // Maximum number of lines allowed in an address between the house number
  23 // and the state, both not included.
  24 const size_t kMaxAddressLines = 5;
  25
  26 // Maximum length allowed for any address word between the house number
  27 // and the state, both not included.
  28 const size_t kMaxAddressNameWordLength = 25;
  29
  30 // Maximum number of words after the house number in which the location name
  31 // should be found.
  32 const size_t kMaxLocationNameDistance = 4;
  33
  34 // Additional characters used as new line delimiters.
  35 const base::char16 kNewlineDelimiters[] = {
  36   '\n',
  37   ',',
  38   '*',
  39   0x2022,  // Unicode bullet
  40   0,
  41 };
  42
  43 }  // anonymous namespace
  44
  45 namespace content {
  46
  47 namespace address_parser {
  48
  49 using namespace internal;
  50
  51 bool FindAddress(const base::string16& text, base::string16* address) {
  52   size_t start, end;
  53   if (FindAddress(text.begin(), text.end(), &start, &end)) {
  54     size_t len = end >= start ? end - start : 0;
  55     address->assign(text.substr(start, len));
  56     return true;
  57   }
  58   return false;
  59 }
  60
  61 bool FindAddress(const base::string16::const_iterator& begin,
  62                  const base::string16::const_iterator& end,
  63                  size_t* start_pos,
  64                  size_t* end_pos) {
  65   HouseNumberParser house_number_parser;
  66
  67   // Keep going through the input string until a potential house number is
  68   // detected. Start tokenizing the following words to find a valid
  69   // street name within a word range. Then, find a state name followed
  70   // by a valid zip code for that state. Also keep a look for any other
  71   // possible house numbers to continue from in case of no match and for
  72   // state names not followed by a zip code (e.g. New York, NY 10000).
  73   const base::string16 newline_delimiters = kNewlineDelimiters;
  74   const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
  75   for (base::string16::const_iterator it = begin; it != end; ) {
  76     Word house_number;
  77     if (!house_number_parser.Parse(it, end, &house_number))
  78       return false;
  79
  80     String16Tokenizer tokenizer(house_number.end, end, delimiters);
  81     tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
  82
  83     WordList words;
  84     words.push_back(house_number);
  85
  86     bool found_location_name = false;
  87     bool continue_on_house_number = true;
  88     bool consecutive_house_numbers = true;
  89     size_t next_house_number_word = 0;
  90     size_t num_lines = 1;
  91
  92     // Don't include the house number in the word count.
  93     size_t next_word = 1;
  94     for (; next_word <= kMaxAddressWords + 1; ++next_word) {
  95
  96       // Extract a new word from the tokenizer.
  97       if (next_word == words.size()) {
  98         do {
  99           if (!tokenizer.GetNext())
 100             return false;
 101
 102           // Check the number of address lines.
 103           if (tokenizer.token_is_delim() && newline_delimiters.find(
 104               *tokenizer.token_begin()) != base::string16::npos) {
 105             ++num_lines;
 106           }
 107         } while (tokenizer.token_is_delim());
 108
 109         if (num_lines > kMaxAddressLines)
 110           break;
 111
 112         words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
 113       }
 114
 115       // Check the word length. If too long, don't try to continue from
 116       // the next house number as no address can hold this word.
 117       const Word& current_word = words[next_word];
 118       DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
 119       size_t current_word_length = std::distance(
 120           current_word.begin, current_word.end);
 121       if (current_word_length > kMaxAddressNameWordLength) {
 122         continue_on_house_number = false;
 123         break;
 124       }
 125
 126       // Check if the new word is a valid house number.
 127       if (house_number_parser.Parse(current_word.begin, current_word.end,
 128           NULL)) {
 129         // Increase the number of consecutive house numbers since the beginning.
 130         if (consecutive_house_numbers) {
 131           // Check if there is a new line between consecutive house numbers.
 132           // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
 133           if (num_lines > 1) {
 134             next_house_number_word = next_word;
 135             break;
 136           }
 137         }
 138
 139         // Keep the next candidate to resume parsing from in case of failure.
 140         if (next_house_number_word == 0) {
 141           next_house_number_word = next_word;
 142           continue;
 143         }
 144       } else {
 145         consecutive_house_numbers = false;
 146       }
 147
 148       // Look for location names in the words after the house number.
 149       // A range limitation is introduced to avoid matching
 150       // anything that starts with a number before a legitimate address.
 151       if (next_word <= kMaxLocationNameDistance &&
 152           IsValidLocationName(current_word)) {
 153         found_location_name = true;
 154         continue;
 155       }
 156
 157       // Don't count the house number.
 158       if (next_word > kMinAddressWords) {
 159         // Looking for the state is likely to add new words to the list while
 160         // checking for multi-word state names.
 161         size_t state_first_word = next_word;
 162         size_t state_last_word, state_index;
 163         if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
 164                                     &tokenizer, &state_index)) {
 165
 166           // A location name should have been found at this point.
 167           if (!found_location_name)
 168             break;
 169
 170           // Explicitly exclude "et al", as "al" is a valid state code.
 171           if (current_word_length == 2 && words.size() > 2) {
 172             const Word& previous_word = words[state_first_word - 1];
 173             if (previous_word.end - previous_word.begin == 2 &&
 174                 base::LowerCaseEqualsASCII(
 175                     base::StringPiece16(previous_word.begin, previous_word.end),
 176                     "et") &&
 177                 base::LowerCaseEqualsASCII(
 178                      base::StringPiece16(current_word.begin, current_word.end),
 179                      "al"))
 180               break;
 181           }
 182
 183           // Extract one more word from the tokenizer if not already available.
 184           size_t zip_word = state_last_word + 1;
 185           if (zip_word == words.size()) {
 186             do {
 187               if (!tokenizer.GetNext()) {
 188                 // The address ends with a state name without a zip code. This
 189                 // is legal according to WebView#findAddress public
 190                 // documentation.
 191                 *start_pos = words[0].begin - begin;
 192                 *end_pos = words[state_last_word].end - begin;
 193                 return true;
 194               }
 195             } while (tokenizer.token_is_delim());
 196             words.push_back(Word(tokenizer.token_begin(),
 197                             tokenizer.token_end()));
 198           }
 199
 200           // Check the parsing validity and state range of the zip code.
 201           next_word = state_last_word;
 202           if (!IsZipValid(words[zip_word], state_index))
 203             continue;
 204
 205           *start_pos = words[0].begin - begin;
 206           *end_pos = words[zip_word].end - begin;
 207           return true;
 208         }
 209       }
 210     }
 211
 212     // Avoid skipping too many words because of a non-address number
 213     // at the beginning of the contents to parse.
 214     if (continue_on_house_number && next_house_number_word > 0) {
 215       it = words[next_house_number_word].begin;
 216     } else {
 217       DCHECK(!words.empty());
 218       next_word = std::min(next_word, words.size() - 1);
 219       it = words[next_word].end;
 220     }
 221   }
 222
 223   return false;
 224 }
 225
 226 }  // namespace address_parser
 227
 228 }  // namespace content