1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/common/android/address_parser.h"
7 #include "base/logging.h"
8 #include "base/strings/string_util.h"
9 #include "content/common/android/address_parser_internal.h"
13 // Minimum number of words in an address after the house number
14 // before a state is expected to be found.
15 // A value too high can miss short addresses.
// FindAddress only starts looking for a state name once the index of the
// current word (the house number is stored at index 0 and is not counted) is
// greater than this value.
16 const size_t kMinAddressWords
= 3;
18 // Maximum number of words allowed in an address between the house number
19 // and the state, both not included.
// Bounds the word-scanning loop in FindAddress, which keeps running while the
// word index is <= kMaxAddressWords + 1.
20 const size_t kMaxAddressWords
= 12;
22 // Maximum number of lines allowed in an address between the house number
23 // and the state, both not included.
// FindAddress compares its running line count against this limit
// (num_lines > kMaxAddressLines) while pulling new words from the tokenizer.
24 const size_t kMaxAddressLines
= 5;
26 // Maximum length allowed for any address word between the house number
27 // and the state, both not included.
// When a word is longer than this, FindAddress clears
// continue_on_house_number, so parsing is not resumed from the next candidate
// house number.
28 const size_t kMaxAddressNameWordLength
= 25;
30 // Maximum number of words after the house number in which the location name
// is looked for: FindAddress only tests IsValidLocationName() on a word whose
// index is <= this value.
32 const size_t kMaxLocationNameDistance
= 4;
34 // Additional characters used as new line delimiters.
// FindAddress copies this array into a base::string16 and appends it to
// base::kWhitespaceUTF16 to build the tokenizer's delimiter set; the same
// string is used to recognise line breaks between address words.
// NOTE(review): only the bullet entry is visible here; any other entries and
// the terminating element are presumably on lines not shown - confirm.
35 const base::char16 kNewlineDelimiters
[] = {
39 0x2022, // Unicode bullet
43 } // anonymous namespace
47 namespace address_parser
{
49 using namespace internal
;
// Convenience overload operating on a whole string.
//
// Runs the iterator-based FindAddress() over [text.begin(), text.end()) and,
// when it reports a match, stores the matched substring in |*address|.
//
//   text    - the string to search.
//   address - output; assigned text.substr(start, len) on a match.
//
// NOTE(review): the declarations of |start| / |end| and the return statements
// are not visible in this view; presumably the function returns whether an
// address was found - confirm.
51 bool FindAddress(const base::string16
& text
, base::string16
* address
) {
// |start| and |end| receive the offsets of the match within |text|.
53 if (FindAddress(text
.begin(), text
.end(), &start
, &end
)) {
// Guard against end < start so the subtraction cannot wrap around.
54 size_t len
= end
>= start
? end
- start
: 0;
55 address
->assign(text
.substr(start
, len
));
// Iterator-based address search over the range [begin, end).
//
// Outline of the visible logic:
//   1. Use HouseNumberParser to locate a candidate house number.
//   2. Tokenize the text after it (delimiters are returned as tokens too) and
//      collect the words in |words|, with the house number at index 0.
//   3. While scanning, remember the next candidate house number, look for a
//      location name within kMaxLocationNameDistance words, and reject words
//      longer than kMaxAddressNameWordLength.
//   4. Past kMinAddressWords words, try to match a state name starting at the
//      current word, then validate the following word as a zip code for that
//      state.
//   5. On a match, write the offsets (relative to |begin|) of the start of
//      the house number and the end of the zip code (or of the state, when
//      the input ends right after the state) to |*start_pos| / |*end_pos|.
//   6. Otherwise resume either from the remembered house number candidate or
//      right after the last word examined.
//
// NOTE(review): the |start_pos| / |end_pos| parameter declarations, several
// local declarations and the return / break / continue statements are not
// visible in this view; presumably the function returns true on a match and
// false otherwise - confirm.
61 bool FindAddress(const base::string16::const_iterator
& begin
,
62 const base::string16::const_iterator
& end
,
65 HouseNumberParser house_number_parser
;
67 // Keep going through the input string until a potential house number is
68 // detected. Start tokenizing the following words to find a valid
69 // street name within a word range. Then, find a state name followed
70 // by a valid zip code for that state. Also keep a look for any other
71 // possible house numbers to continue from in case of no match and for
72 // state names not followed by a zip code (e.g. New York, NY 10000).
73 const base::string16 newline_delimiters
= kNewlineDelimiters
;
// Tokenizer delimiters: regular whitespace plus the extra newline characters.
74 const base::string16 delimiters
= base::kWhitespaceUTF16
+ newline_delimiters
;
75 for (base::string16::const_iterator it
= begin
; it
!= end
; ) {
77 if (!house_number_parser
.Parse(it
, end
, &house_number
))
// Tokenize the text that follows the house number. Delimiters are returned
// as tokens so that line breaks can be detected further down.
80 String16Tokenizer
tokenizer(house_number
.end
, end
, delimiters
);
81 tokenizer
.set_options(String16Tokenizer::RETURN_DELIMS
);
// words[0] is always the house number itself.
84 words
.push_back(house_number
);
86 bool found_location_name
= false;
87 bool continue_on_house_number
= true;
88 bool consecutive_house_numbers
= true;
// Index in |words| of the next house number candidate; 0 means none yet.
89 size_t next_house_number_word
= 0;
92 // Don't include the house number in the word count.
94 for (; next_word
<= kMaxAddressWords
+ 1; ++next_word
) {
96 // Extract a new word from the tokenizer.
// Only needed when this index has not already been filled in (the state
// lookup below may have appended words ahead of the scan position).
97 if (next_word
== words
.size()) {
99 if (!tokenizer
.GetNext())
102 // Check the number of address lines.
103 if (tokenizer
.token_is_delim() && newline_delimiters
.find(
104 *tokenizer
.token_begin()) != base::string16::npos
) {
107 } while (tokenizer
.token_is_delim());
109 if (num_lines
> kMaxAddressLines
)
112 words
.push_back(Word(tokenizer
.token_begin(), tokenizer
.token_end()));
115 // Check the word length. If too long, don't try to continue from
116 // the next house number as no address can hold this word.
117 const Word
& current_word
= words
[next_word
];
118 DCHECK_GT(std::distance(current_word
.begin
, current_word
.end
), 0);
119 size_t current_word_length
= std::distance(
120 current_word
.begin
, current_word
.end
);
121 if (current_word_length
> kMaxAddressNameWordLength
) {
122 continue_on_house_number
= false;
126 // Check if the new word is a valid house number.
127 if (house_number_parser
.Parse(current_word
.begin
, current_word
.end
,
129 // Increase the number of consecutive house numbers since the beginning.
130 if (consecutive_house_numbers
) {
131 // Check if there is a new line between consecutive house numbers.
132 // This avoids false positives of the form "Cafe 21\n 750 Fifth Ave.."
134 next_house_number_word
= next_word
;
139 // Keep the next candidate to resume parsing from in case of failure.
140 if (next_house_number_word
== 0) {
141 next_house_number_word
= next_word
;
145 consecutive_house_numbers
= false;
148 // Look for location names in the words after the house number.
149 // A range limitation is introduced to avoid matching
150 // anything that starts with a number before a legitimate address.
151 if (next_word
<= kMaxLocationNameDistance
&&
152 IsValidLocationName(current_word
)) {
153 found_location_name
= true;
157 // Don't count the house number.
158 if (next_word
> kMinAddressWords
) {
159 // Looking for the state is likely to add new words to the list while
160 // checking for multi-word state names.
161 size_t state_first_word
= next_word
;
162 size_t state_last_word
, state_index
;
163 if (FindStateStartingInWord(&words
, state_first_word
, &state_last_word
,
164 &tokenizer
, &state_index
)) {
166 // A location name should have been found at this point.
167 if (!found_location_name
)
170 // Explicitly exclude "et al", as "al" is a valid state code.
171 if (current_word_length
== 2 && words
.size() > 2) {
172 const Word
& previous_word
= words
[state_first_word
- 1];
173 if (previous_word
.end
- previous_word
.begin
== 2 &&
174 base::LowerCaseEqualsASCII(
175 base::StringPiece16(previous_word
.begin
, previous_word
.end
),
177 base::LowerCaseEqualsASCII(
178 base::StringPiece16(current_word
.begin
, current_word
.end
),
183 // Extract one more word from the tokenizer if not already available.
184 size_t zip_word
= state_last_word
+ 1;
185 if (zip_word
== words
.size()) {
187 if (!tokenizer
.GetNext()) {
188 // The address ends with a state name without a zip code. This
189 // is legal according to WebView#findAddress public
// documentation, so the match is reported as running from the start of the
// house number to the end of the last state word (offsets relative to
// |begin|).
191 *start_pos
= words
[0].begin
- begin
;
192 *end_pos
= words
[state_last_word
].end
- begin
;
195 } while (tokenizer
.token_is_delim());
196 words
.push_back(Word(tokenizer
.token_begin(),
197 tokenizer
.token_end()));
200 // Check the parsing validity and state range of the zip code.
201 next_word
= state_last_word
;
202 if (!IsZipValid(words
[zip_word
], state_index
))
// Match found: the address spans from the start of the house number to the
// end of the zip code, expressed as offsets relative to |begin|.
205 *start_pos
= words
[0].begin
- begin
;
206 *end_pos
= words
[zip_word
].end
- begin
;
212 // Avoid skipping too many words because of a non-address number
213 // at the beginning of the contents to parse.
214 if (continue_on_house_number
&& next_house_number_word
> 0) {
215 it
= words
[next_house_number_word
].begin
;
// Resume scanning right after the last word examined; next_word is clamped to
// the last valid index of |words| first.
217 DCHECK(!words
.empty());
218 next_word
= std::min(next_word
, words
.size() - 1);
219 it
= words
[next_word
].end
;
226 } // namespace address_parser
228 } // namespace content