// Extracted from a gitweb blob view:
//   page title: "Blink roll 25b6bd3a7a131ffe68d809546ad1a20707915cdc:3a503f41ae42e5b79cfcd2ff10e65afde..."
//   [chromium-blink-merge.git] / content / common / android / address_parser.cc
//   blob 30fa304ba4cd7d1219b5c244d1343bd0f953ae3e
// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "content/common/android/address_parser.h"

#include <algorithm>
#include <iterator>

#include "base/logging.h"
#include "base/strings/string_util.h"
#include "content/common/android/address_parser_internal.h"
11 namespace {
13 // Minimum number of words in an address after the house number
14 // before a state is expected to be found.
15 // A value too high can miss short addresses.
16 const size_t kMinAddressWords = 3;
18 // Maximum number of words allowed in an address between the house number
19 // and the state, both not included.
20 const size_t kMaxAddressWords = 12;
22 // Maximum number of lines allowed in an address between the house number
23 // and the state, both not included.
24 const size_t kMaxAddressLines = 5;
26 // Maximum length allowed for any address word between the house number
27 // and the state, both not included.
28 const size_t kMaxAddressNameWordLength = 25;
30 // Maximum number of words after the house number in which the location name
31 // should be found.
32 const size_t kMaxLocationNameDistance = 4;
34 // Additional characters used as new line delimiters.
35 const base::char16 kNewlineDelimiters[] = {
36 '\n',
37 ',',
38 '*',
39 0x2022, // Unicode bullet
43 } // anonymous namespace
45 namespace content {
47 namespace address_parser {
49 using namespace internal;
51 bool FindAddress(const base::string16& text, base::string16* address) {
52 size_t start, end;
53 if (FindAddress(text.begin(), text.end(), &start, &end)) {
54 size_t len = end >= start ? end - start : 0;
55 address->assign(text.substr(start, len));
56 return true;
58 return false;
61 bool FindAddress(const base::string16::const_iterator& begin,
62 const base::string16::const_iterator& end,
63 size_t* start_pos,
64 size_t* end_pos) {
65 HouseNumberParser house_number_parser;
67 // Keep going through the input string until a potential house number is
68 // detected. Start tokenizing the following words to find a valid
69 // street name within a word range. Then, find a state name followed
70 // by a valid zip code for that state. Also keep a look for any other
71 // possible house numbers to continue from in case of no match and for
72 // state names not followed by a zip code (e.g. New York, NY 10000).
73 const base::string16 newline_delimiters = kNewlineDelimiters;
74 const base::string16 delimiters = base::kWhitespaceUTF16 + newline_delimiters;
75 for (base::string16::const_iterator it = begin; it != end; ) {
76 Word house_number;
77 if (!house_number_parser.Parse(it, end, &house_number))
78 return false;
80 String16Tokenizer tokenizer(house_number.end, end, delimiters);
81 tokenizer.set_options(String16Tokenizer::RETURN_DELIMS);
83 WordList words;
84 words.push_back(house_number);
86 bool found_location_name = false;
87 bool continue_on_house_number = true;
88 bool consecutive_house_numbers = true;
89 size_t next_house_number_word = 0;
90 size_t num_lines = 1;
92 // Don't include the house number in the word count.
93 size_t next_word = 1;
94 for (; next_word <= kMaxAddressWords + 1; ++next_word) {
96 // Extract a new word from the tokenizer.
97 if (next_word == words.size()) {
98 do {
99 if (!tokenizer.GetNext())
100 return false;
102 // Check the number of address lines.
103 if (tokenizer.token_is_delim() && newline_delimiters.find(
104 *tokenizer.token_begin()) != base::string16::npos) {
105 ++num_lines;
107 } while (tokenizer.token_is_delim());
109 if (num_lines > kMaxAddressLines)
110 break;
112 words.push_back(Word(tokenizer.token_begin(), tokenizer.token_end()));
115 // Check the word length. If too long, don't try to continue from
116 // the next house number as no address can hold this word.
117 const Word& current_word = words[next_word];
118 DCHECK_GT(std::distance(current_word.begin, current_word.end), 0);
119 size_t current_word_length = std::distance(
120 current_word.begin, current_word.end);
121 if (current_word_length > kMaxAddressNameWordLength) {
122 continue_on_house_number = false;
123 break;
126 // Check if the new word is a valid house number.
127 if (house_number_parser.Parse(current_word.begin, current_word.end,
128 NULL)) {
129 // Increase the number of consecutive house numbers since the beginning.
130 if (consecutive_house_numbers) {
131 // Check if there is a new line between consecutive house numbers.
132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
133 if (num_lines > 1) {
134 next_house_number_word = next_word;
135 break;
139 // Keep the next candidate to resume parsing from in case of failure.
140 if (next_house_number_word == 0) {
141 next_house_number_word = next_word;
142 continue;
144 } else {
145 consecutive_house_numbers = false;
148 // Look for location names in the words after the house number.
149 // A range limitation is introduced to avoid matching
150 // anything that starts with a number before a legitimate address.
151 if (next_word <= kMaxLocationNameDistance &&
152 IsValidLocationName(current_word)) {
153 found_location_name = true;
154 continue;
157 // Don't count the house number.
158 if (next_word > kMinAddressWords) {
159 // Looking for the state is likely to add new words to the list while
160 // checking for multi-word state names.
161 size_t state_first_word = next_word;
162 size_t state_last_word, state_index;
163 if (FindStateStartingInWord(&words, state_first_word, &state_last_word,
164 &tokenizer, &state_index)) {
166 // A location name should have been found at this point.
167 if (!found_location_name)
168 break;
170 // Explicitly exclude "et al", as "al" is a valid state code.
171 if (current_word_length == 2 && words.size() > 2) {
172 const Word& previous_word = words[state_first_word - 1];
173 if (previous_word.end - previous_word.begin == 2 &&
174 LowerCaseEqualsASCII(previous_word.begin, previous_word.end,
175 "et") &&
176 LowerCaseEqualsASCII(current_word.begin, current_word.end,
177 "al"))
178 break;
181 // Extract one more word from the tokenizer if not already available.
182 size_t zip_word = state_last_word + 1;
183 if (zip_word == words.size()) {
184 do {
185 if (!tokenizer.GetNext())
186 return false;
187 } while (tokenizer.token_is_delim());
188 words.push_back(Word(tokenizer.token_begin(),
189 tokenizer.token_end()));
192 // Check the parsing validity and state range of the zip code.
193 next_word = state_last_word;
194 if (!IsZipValid(words[zip_word], state_index))
195 continue;
197 *start_pos = words[0].begin - begin;
198 *end_pos = words[zip_word].end - begin;
199 return true;
204 // Avoid skipping too many words because of a non-address number
205 // at the beginning of the contents to parse.
206 if (continue_on_house_number && next_house_number_word > 0) {
207 it = words[next_house_number_word].begin;
208 } else {
209 DCHECK(!words.empty());
210 next_word = std::min(next_word, words.size() - 1);
211 it = words[next_word].end;
215 return false;
218 } // namespace address_parser
220 } // namespace content