1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/autofill/core/browser/address_field.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string16.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "components/autofill/core/browser/autofill_field.h"
15 #include "components/autofill/core/browser/autofill_regex_constants.h"
16 #include "components/autofill/core/browser/autofill_scanner.h"
17 #include "components/autofill/core/browser/field_types.h"
19 using base::UTF8ToUTF16
;
// Scans forward from the scanner's current position looking for a group of
// address-related fields. On each loop iteration it tries, in order, the
// AddressField::Parse* helpers (address lines, city, state, zip code, country,
// company); failing that, the "attention"/"region" ignore patterns; failing
// that, an empty label (only once the cursor has moved off the initial field).
// If any member field pointer ends up set, the scanner is rewound to the start
// of any trailing run of non-labeled fields and |address_field| is returned
// via Pass().
// NOTE(review): this excerpt is a garbled extraction: the leading integers are
// stray original line numbers, statements are split across lines, and several
// original lines (closing braces, remaining branches, the final return) are
// not visible. Comments below describe only what is visible.
23 scoped_ptr
<FormField
> AddressField::Parse(AutofillScanner
* scanner
) {
27 scoped_ptr
<AddressField
> address_field(new AddressField
);
// Remember the field and cursor position parsing started from; |saved_cursor|
// is what the RewindTo() call at the bottom of this block restores.
28 const AutofillField
* const initial_field
= scanner
->Cursor();
29 size_t saved_cursor
= scanner
->SaveCursor();
// Patterns for fields that are matched but not stored: they are passed to
// ParseField() below with a NULL output pointer.
31 base::string16 attention_ignored
= UTF8ToUTF16(kAttentionIgnoredRe
);
32 base::string16 region_ignored
= UTF8ToUTF16(kRegionIgnoredRe
);
34 // Allow address fields to appear in any order.
35 size_t begin_trailing_non_labeled_fields
= 0;
36 bool has_trailing_non_labeled_fields
= false;
37 while (!scanner
->IsEnd()) {
// Cursor position at the start of this iteration; recorded below as the
// beginning of a run of non-labeled fields.
38 const size_t cursor
= scanner
->SaveCursor();
39 if (address_field
->ParseAddressLines(scanner
) ||
40 address_field
->ParseCity(scanner
) ||
41 address_field
->ParseState(scanner
) ||
42 address_field
->ParseZipCode(scanner
) ||
43 address_field
->ParseCountry(scanner
) ||
44 address_field
->ParseCompany(scanner
)) {
// A recognized address field ends any run of non-labeled fields seen so far,
// so that run is no longer "trailing".
45 has_trailing_non_labeled_fields
= false;
47 } else if (ParseField(scanner
, attention_ignored
, NULL
) ||
48 ParseField(scanner
, region_ignored
, NULL
)) {
49 // We ignore the following:
51 // * Province/Region/Other.
53 } else if (scanner
->Cursor() != initial_field
&&
54 ParseEmptyLabel(scanner
, NULL
)) {
55 // Ignore non-labeled fields within an address; the page
56 // MapQuest Driving Directions North America.html contains such a field.
57 // We only ignore such fields after we've parsed at least one other field;
58 // otherwise we'd effectively parse address fields before other field
59 // types after any non-labeled fields, and we want email address fields to
60 // have precedence since some pages contain fields labeled
// Only the first field of a run records where the run begins.
62 if (!has_trailing_non_labeled_fields
) {
63 has_trailing_non_labeled_fields
= true;
64 begin_trailing_non_labeled_fields
= cursor
;
// NOTE(review): the rest of the loop body (original lines 65-73) is not
// visible in this excerpt.
74 // If we have identified any address fields in this field then it should be
75 // added to the list of fields.
76 if (address_field
->company_
||
77 address_field
->address1_
||
78 address_field
->address2_
||
79 address_field
->address3_
||
80 address_field
->street_address_
||
81 address_field
->city_
||
82 address_field
->state_
||
83 address_field
->zip_
||
84 address_field
->zip4_
||
85 address_field
->country_
) {
86 // Don't slurp non-labeled fields at the end into the address.
87 if (has_trailing_non_labeled_fields
)
88 scanner
->RewindTo(begin_trailing_non_labeled_fields
);
90 return address_field
.Pass();
// NOTE(review): presumably reached only when no address member was set; it
// restores the scanner to where parsing began. The enclosing braces and the
// statement that follows are not visible here -- confirm against the full file.
93 scanner
->RewindTo(saved_cursor
);
// Constructor. Only the street_address_(NULL) member initializer is visible in
// this excerpt.
// NOTE(review): the remaining initializers and the constructor body are on
// original lines that are not shown here; presumably every field pointer is
// initialized to NULL in the same way -- confirm against the full file.
97 AddressField::AddressField()
102 street_address_(NULL
),
// Records the parsed fields in |map| by calling AddClassification() once per
// member with its ServerFieldType. The calls are chained with &&, so the
// result is true only if every call returns true, and calls after the first
// false one are not evaluated.
// Note that zip4_ (which Parse() checks) has no AddClassification() call here.
110 bool AddressField::ClassifyField(ServerFieldTypeMap
* map
) const {
111 // The page can request the address lines as a single textarea input or as
112 // multiple text fields (or not at all), but it shouldn't be possible to
// The DCHECKs assert that street_address_ is never set together with any of
// the individual address line fields.
114 DCHECK(!(address1_
&& street_address_
));
115 DCHECK(!(address2_
&& street_address_
));
116 DCHECK(!(address3_
&& street_address_
));
118 return AddClassification(company_
, COMPANY_NAME
, map
) &&
119 AddClassification(address1_
, ADDRESS_HOME_LINE1
, map
) &&
120 AddClassification(address2_
, ADDRESS_HOME_LINE2
, map
) &&
121 AddClassification(address3_
, ADDRESS_HOME_LINE3
, map
) &&
122 AddClassification(street_address_
, ADDRESS_HOME_STREET_ADDRESS
, map
) &&
123 AddClassification(city_
, ADDRESS_HOME_CITY
, map
) &&
124 AddClassification(state_
, ADDRESS_HOME_STATE
, map
) &&
125 AddClassification(zip_
, ADDRESS_HOME_ZIP
, map
) &&
126 AddClassification(country_
, ADDRESS_HOME_COUNTRY
, map
);
// Tries to match the field at the scanner's current position against
// kCompanyRe, storing the match in company_, and returns ParseField()'s result.
// NOTE(review): the statement controlled by the guard below (original line
// 131) is not visible in this excerpt. Presumably it is an early return that
// prevents overwriting an already-populated company_ -- confirm against the
// full file.
129 bool AddressField::ParseCompany(AutofillScanner
* scanner
) {
130 if (company_
&& !company_
->IsEmpty())
133 return ParseField(scanner
, UTF8ToUTF16(kCompanyRe
), &company_
);
// Tries to parse the address-line fields at the scanner's current position:
//   1. address line 1, by pattern (MATCH_DEFAULT) or by label
//      (MATCH_LABEL | MATCH_TEXT), with further attempts that add
//      MATCH_TEXT_AREA;
//   2. address line 2 into address2_, by pattern or by label;
//   3. address line 3 into address3_, using kAddressLinesExtraRe and the
//      line-2 label pattern;
//   4. any further lines matching kAddressLinesExtraRe, which are consumed
//      with a NULL output pointer (i.e. discarded).
// NOTE(review): this excerpt omits several original lines -- the statements
// controlled by the first two `if`s, the output pointers of most
// ParseFieldSpecifics() calls (presumably street_address_ for the
// MATCH_TEXT_AREA attempts), and every return statement. The comments here
// describe only what is visible.
136 bool AddressField::ParseAddressLines(AutofillScanner
* scanner
) {
137 // We only match the string "address" in page text, not in element names,
138 // because sometimes every element in a group of address fields will have
139 // a name containing the string "address"; for example, on the page
140 // Kohl's - Register Billing Address.html the text element labeled "city"
141 // has the name "BILL_TO_ADDRESS<>city". We do match address labels
142 // such as "address1", which appear as element names on various pages (eg
143 // AmericanGirl-Registration.html, BloomingdalesBilling.html,
144 // EBay Registration Enter Information.html).
// Guard on address line 1 / street address already being set; the guarded
// statement is not visible in this excerpt.
145 if (address1_
|| street_address_
)
148 // Ignore "Address Lookup" field. http://crbug.com/427622
149 if (ParseField(scanner
, base::UTF8ToUTF16(kAddressLookupRe
), NULL
))
// |pattern| and |label_pattern| are reused (reassigned) for each address line
// below.
152 base::string16 pattern
= UTF8ToUTF16(kAddressLine1Re
);
153 base::string16 label_pattern
= UTF8ToUTF16(kAddressLine1LabelRe
);
154 if (!ParseFieldSpecifics(scanner
, pattern
, MATCH_DEFAULT
, &address1_
) &&
155 !ParseFieldSpecifics(scanner
, label_pattern
, MATCH_LABEL
| MATCH_TEXT
,
157 !ParseFieldSpecifics(scanner
, pattern
, MATCH_DEFAULT
| MATCH_TEXT_AREA
,
159 !ParseFieldSpecifics(scanner
, label_pattern
,
160 MATCH_LABEL
| MATCH_TEXT_AREA
,
167 // This code may not pick up pages that have an address field consisting of a
168 // sequence of unlabeled address fields. If we need to add this, see
169 // discussion on https://codereview.chromium.org/741493003/
170 pattern
= UTF8ToUTF16(kAddressLine2Re
);
171 label_pattern
= UTF8ToUTF16(kAddressLine2LabelRe
);
172 if (!ParseField(scanner
, pattern
, &address2_
) &&
173 !ParseFieldSpecifics(scanner
, label_pattern
, MATCH_LABEL
| MATCH_TEXT
,
177 // Optionally parse address line 3. This uses the same label regexp as
// Only |pattern| is reassigned here; |label_pattern| still holds the line-2
// label pattern.
179 pattern
= UTF8ToUTF16(kAddressLinesExtraRe
);
180 if (!ParseField(scanner
, pattern
, &address3_
) &&
181 !ParseFieldSpecifics(scanner
, label_pattern
, MATCH_LABEL
| MATCH_TEXT
,
185 // Try for surplus lines, which we will promptly discard. Some pages have 4
186 // address lines (e.g. uk/ShoesDirect2.html)!
188 // Since these are rare, don't bother considering unlabeled lines as extra
190 pattern
= UTF8ToUTF16(kAddressLinesExtraRe
);
191 while (ParseField(scanner
, pattern
, NULL
)) {
192 // Consumed a surplus line, try for another.
// Tries to match the field at the scanner's current position against
// kCountryRe using MATCH_DEFAULT | MATCH_SELECT, and returns
// ParseFieldSpecifics()'s result.
// NOTE(review): the statement controlled by the guard (original line 201) and
// the final argument of the ParseFieldSpecifics() call (presumably &country_)
// are not visible in this excerpt -- confirm against the full file.
197 bool AddressField::ParseCountry(AutofillScanner
* scanner
) {
198 // Parse a country. The occasional page (e.g.
199 // Travelocity_New Member Information1.html) calls this a "location".
200 if (country_
&& !country_
->IsEmpty())
203 return ParseFieldSpecifics(scanner
,
204 UTF8ToUTF16(kCountryRe
),
205 MATCH_DEFAULT
| MATCH_SELECT
,
// Tries to match a zip code field (kZipCodeRe) at the scanner's current
// position and then a zip+4 field (kZip4Re). Both attempts use
// MATCH_DEFAULT | MATCH_TELEPHONE. The result of the zip+4
// ParseFieldSpecifics() call is not used.
// NOTE(review): the output pointers of both calls (presumably &zip_ and
// &zip4_), any guard before the first call, and all return statements are on
// original lines not visible in this excerpt -- confirm against the full file.
209 bool AddressField::ParseZipCode(AutofillScanner
* scanner
) {
210 // Parse a zip code. On some UK pages (e.g. The China Shop2.html) this
211 // is called a "post code".
215 // Some sites use type="tel" for zip fields (to get a numerical input).
216 // http://crbug.com/426958
217 if (!ParseFieldSpecifics(scanner
,
218 UTF8ToUTF16(kZipCodeRe
),
219 MATCH_DEFAULT
| MATCH_TELEPHONE
,
224 // Look for a zip+4, whose field name will also often contain
225 // the substring "zip".
226 ParseFieldSpecifics(scanner
,
227 UTF8ToUTF16(kZip4Re
),
228 MATCH_DEFAULT
| MATCH_TELEPHONE
,
// Tries to match the field at the scanner's current position against kCityRe
// using MATCH_DEFAULT | MATCH_SELECT, and returns ParseFieldSpecifics()'s
// result.
// NOTE(review): original lines 235-238 (the rest of the comment below and any
// guard) and the final argument of the call (presumably &city_) are not
// visible in this excerpt -- confirm against the full file.
233 bool AddressField::ParseCity(AutofillScanner
* scanner
) {
234 // Parse a city name. Some UK pages (e.g. The China Shop2.html) use
239 // Select fields are allowed here. This occurs on top-100 site rediff.com.
240 return ParseFieldSpecifics(scanner
,
241 UTF8ToUTF16(kCityRe
),
242 MATCH_DEFAULT
| MATCH_SELECT
,
// Tries to match the field at the scanner's current position against kStateRe
// using MATCH_DEFAULT | MATCH_SELECT (so select elements are accepted as well),
// and returns ParseFieldSpecifics()'s result.
// NOTE(review): original lines 247-249 (any guard) and the final argument of
// the call (presumably &state_) are not visible in this excerpt -- confirm
// against the full file.
246 bool AddressField::ParseState(AutofillScanner
* scanner
) {
250 return ParseFieldSpecifics(scanner
,
251 UTF8ToUTF16(kStateRe
),
252 MATCH_DEFAULT
| MATCH_SELECT
,
256 } // namespace autofill