1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/common/android/address_parser.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "content/common/android/address_parser_internal.h"
13 // Minimum number of words in an address after the house number
14 // before a state is expected to be found.
15 // A value too high can miss short addresses.
// FindAddress only starts looking for a state once its word index (which does
// not count the house number) is greater than this value.
16 const size_t kMinAddressWords
= 3;
18 // Maximum number of words allowed in an address between the house number
19 // and the state, both not included.
// Bounds the word loop in FindAddress, which keeps running while the word
// index is <= kMaxAddressWords + 1.
20 const size_t kMaxAddressWords
= 12;
22 // Maximum number of lines allowed in an address between the house number
23 // and the state, both not included.
// FindAddress compares its num_lines counter against this limit after pulling
// the next word from the tokenizer.
24 const size_t kMaxAddressLines
= 5;
26 // Maximum length allowed for any address word between the house number
27 // and the state, both not included.
// When a word is longer than this, FindAddress clears
// continue_on_house_number, so parsing is not resumed from a later house
// number candidate of the same attempt.
28 const size_t kMaxAddressNameWordLength
= 25;
30 // Maximum number of words after the house number in which the location name
// is looked for: FindAddress only calls IsValidLocationName on a word whose
// index is <= this value.
32 const size_t kMaxLocationNameDistance
= 4;
34 // Additional characters used as new line delimiters.
// FindAddress copies this array into a base::string16 (newline_delimiters) and
// appends it to base::kWhitespaceUTF16 to form the tokenizer delimiter set.
// NOTE(review): building a string from a character array reads up to a
// terminating 0 element; only the bullet entry is visible here, so confirm the
// array ends with one.
35 const base::char16 kNewlineDelimiters
[] = {
39 0x2022, // Unicode bullet
43 } // anonymous namespace
47 namespace address_parser
{
49 using namespace internal
;
// Convenience overload: runs the iterator-based FindAddress over the whole of
// |text| and, when it reports a match, assigns the matched range of |text| to
// |*address|.
51 bool FindAddress(const base::string16
& text
, base::string16
* address
) {
53 if (FindAddress(text
.begin(), text
.end(), &start
, &end
)) {
// The iterator-based overload reports |start| and |end| as offsets from the
// beginning of the text, while substr() takes (position, count). The count is
// therefore end - start; passing |end| itself would copy |start| extra
// characters past the address whenever the match does not begin at offset 0.
54 address
->assign(text
.substr(start
, end - start
));
// Scans the range [begin, end) for an address. The visible code repeatedly
// asks HouseNumberParser for a candidate house number, tokenizes the words
// that follow it, looks for a state with FindStateStartingInWord and checks
// the word after the state with IsZipValid. After that check, |*start_pos|
// and |*end_pos| are set to offsets relative to |begin|: the start of the
// house number and the end of the zip word respectively.
60 bool FindAddress(const base::string16::const_iterator
& begin
,
61 const base::string16::const_iterator
& end
,
64 HouseNumberParser house_number_parser
;
66 // Keep going through the input string until a potential house number is
67 // detected. Start tokenizing the following words to find a valid
68 // street name within a word range. Then, find a state name followed
69 // by a valid zip code for that state. Also keep a lookout for any other
70 // possible house numbers to continue from in case of no match and for
71 // state names not followed by a zip code (e.g. New York, NY 10000).
// |newline_delimiters| is kept separately because it is also used below to
// count address lines; |delimiters| (whitespace plus those characters) is what
// the tokenizer splits on.
72 const base::string16 newline_delimiters
= kNewlineDelimiters
;
73 const base::string16 delimiters
= base::kWhitespaceUTF16
+ newline_delimiters
;
74 for (base::string16::const_iterator it
= begin
; it
!= end
; ) {
76 if (!house_number_parser
.Parse(it
, end
, &house_number
))
// Tokenize everything that follows the candidate house number.
79 String16Tokenizer
tokenizer(house_number
.end
, end
, delimiters
);
// Delimiter tokens are requested too (RETURN_DELIMS): the extraction loop
// below tests token_is_delim() to spot newline delimiters.
80 tokenizer
.set_options(String16Tokenizer::RETURN_DELIMS
);
// words[0] is always the candidate house number.
83 words
.push_back(house_number
);
85 bool found_location_name
= false;
86 bool continue_on_house_number
= true;
87 bool consecutive_house_numbers
= true;
88 size_t next_house_number_word
= 0;
91 // Don't include the house number in the word count.
93 for (; next_word
<= kMaxAddressWords
+ 1; ++next_word
) {
95 // Extract a new word from the tokenizer.
96 if (next_word
== words
.size()) {
98 if (!tokenizer
.GetNext())
101 // Check the number of address lines.
102 if (tokenizer
.token_is_delim() && newline_delimiters
.find(
103 *tokenizer
.token_begin()) != base::string16::npos
) {
106 } while (tokenizer
.token_is_delim());
108 if (num_lines
> kMaxAddressLines
)
111 words
.push_back(Word(tokenizer
.token_begin(), tokenizer
.token_end()));
114 // Check the word length. If too long, don't try to continue from
115 // the next house number as no address can hold this word.
116 const Word
& current_word
= words
[next_word
];
117 DCHECK_GT(std::distance(current_word
.begin
, current_word
.end
), 0);
118 size_t current_word_length
= std::distance(
119 current_word
.begin
, current_word
.end
);
120 if (current_word_length
> kMaxAddressNameWordLength
) {
121 continue_on_house_number
= false;
125 // Check if the new word is a valid house number.
126 if (house_number_parser
.Parse(current_word
.begin
, current_word
.end
,
128 // Increase the number of consecutive house numbers since the beginning.
129 if (consecutive_house_numbers
) {
130 // Check if there is a new line between consecutive house numbers.
131 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
133 next_house_number_word
= next_word
;
138 // Keep the next candidate to resume parsing from in case of failure.
139 if (next_house_number_word
== 0) {
140 next_house_number_word
= next_word
;
144 consecutive_house_numbers
= false;
147 // Look for location names in the words after the house number.
148 // A range limitation is introduced to avoid matching
149 // anything that starts with a number before a legitimate address.
150 if (next_word
<= kMaxLocationNameDistance
&&
151 IsValidLocationName(current_word
)) {
152 found_location_name
= true;
156 // Don't count the house number.
157 if (next_word
> kMinAddressWords
) {
158 // Looking for the state is likely to add new words to the list while
159 // checking for multi-word state names.
160 size_t state_first_word
= next_word
;
161 size_t state_last_word
, state_index
;
162 if (FindStateStartingInWord(&words
, state_first_word
, &state_last_word
,
163 &tokenizer
, &state_index
)) {
165 // A location name should have been found at this point.
166 if (!found_location_name
)
169 // Explicitly exclude "et al", as "al" is a valid state code.
170 if (current_word_length
== 2 && words
.size() > 2) {
171 const Word
& previous_word
= words
[state_first_word
- 1];
172 if (previous_word
.end
- previous_word
.begin
== 2 &&
173 LowerCaseEqualsASCII(previous_word
.begin
, previous_word
.end
,
175 LowerCaseEqualsASCII(current_word
.begin
, current_word
.end
,
180 // Extract one more word from the tokenizer if not already available.
181 size_t zip_word
= state_last_word
+ 1;
182 if (zip_word
== words
.size()) {
184 if (!tokenizer
.GetNext())
186 } while (tokenizer
.token_is_delim());
187 words
.push_back(Word(tokenizer
.token_begin(),
188 tokenizer
.token_end()));
191 // Check the parsing validity and state range of the zip code.
192 next_word
= state_last_word
;
193 if (!IsZipValid(words
[zip_word
], state_index
))
// Report the match as offsets from |begin|: from the first character of the
// house number (words[0]) to the end of the zip word.
196 *start_pos
= words
[0].begin
- begin
;
197 *end_pos
= words
[zip_word
].end
- begin
;
203 // Avoid skipping too many words because of a non-address number
204 // at the beginning of the contents to parse.
// Resume from the house number candidate recorded during the word loop, unless
// an over-long word cleared continue_on_house_number.
205 if (continue_on_house_number
&& next_house_number_word
> 0) {
206 it
= words
[next_house_number_word
].begin
;
// Clamp |next_word| to the last word actually extracted and restart scanning
// from the end of that word.
208 DCHECK(!words
.empty());
209 next_word
= std::min(next_word
, words
.size() - 1);
210 it
= words
[next_word
].end
;
217 } // namespace address_parser
219 } // namespace content