1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 // FormAutofillNameUtils is initially translated from
6 // https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
7 export var FormAutofillNameUtils = {
// Tokens that the name-splitting code folds into the family name when they
// directly precede the last word. Lookups go through _containsString, which
// lower-cases the token first, so entries only match if they are lower-case.
68 FAMILY_NAME_PREFIXES: [
// Consulted by _splitCJKName for names entered without a separator, except
// for longer Hangul-only names, which use KOREAN_MULTI_CHAR_SURNAMES instead.
84 // The common and non-ambiguous CJK surnames (last names) that have more than
86 COMMON_CJK_MULTI_CHAR_SURNAMES: [
87 // Korean, taken from the list of surnames:
88 // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
98 // Chinese, taken from the top 10 Chinese 2-character surnames:
99 // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
100 // Simplified Chinese (mostly mainland China)
111 // Traditional Chinese (mostly Taiwan)
122 // All Korean surnames that have more than one character, even the
123 // rare/ambiguous ones.
// _splitCJKName picks this list only when the name consists solely of Hangul
// and whitespace characters and is longer than 3 UTF-16 code units.
124 KOREAN_MULTI_CHAR_SURNAMES: [
141 // The whitespace definition based on
142 // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
// Entries are individual characters: _isCJKName tests each character of a
// name against this list with includes(), and _splitCJKName concatenates the
// entries into a regular-expression character class.
144 "\u0009", // CHARACTER TABULATION
145 "\u000A", // LINE FEED (LF)
146 "\u000B", // LINE TABULATION
147 "\u000C", // FORM FEED (FF)
148 "\u000D", // CARRIAGE RETURN (CR)
150 "\u0085", // NEXT LINE (NEL)
151 "\u00A0", // NO-BREAK SPACE
152 "\u1680", // OGHAM SPACE MARK
155 "\u2002", // EN SPACE
156 "\u2003", // EM SPACE
157 "\u2004", // THREE-PER-EM SPACE
158 "\u2005", // FOUR-PER-EM SPACE
159 "\u2006", // SIX-PER-EM SPACE
160 "\u2007", // FIGURE SPACE
161 "\u2008", // PUNCTUATION SPACE
162 "\u2009", // THIN SPACE
163 "\u200A", // HAIR SPACE
164 "\u2028", // LINE SEPARATOR
165 "\u2029", // PARAGRAPH SEPARATOR
166 "\u202F", // NARROW NO-BREAK SPACE
167 "\u205F", // MEDIUM MATHEMATICAL SPACE
168 "\u3000", // IDEOGRAPHIC SPACE
171 // The middle dot is used as a separator for foreign names in Japanese.
// _isCJKName allows these characters inside a CJK name without counting them
// as CJK characters themselves.
173 "\u30FB", // KATAKANA MIDDLE DOT
174 "\u00B7", // MIDDLE DOT, a (common?) typo for "KATAKANA MIDDLE DOT"
177 // The Unicode range is based on Wiki:
178 // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
179 // https://en.wikipedia.org/wiki/Hangul
180 // https://en.wikipedia.org/wiki/Japanese_writing_system
// Each entry is a "first-last" range written for use inside a regular
// expression character class; the entries are joined and wrapped in [...] to
// build the reCJK regex.
182 "\u1100-\u11FF", // Hangul Jamo
183 "\u3040-\u309F", // Hiragana
184 "\u30A0-\u30FF", // Katakana
185 "\u3105-\u312C", // Bopomofo
186 "\u3130-\u318F", // Hangul Compatibility Jamo
187 "\u31F0-\u31FF", // Katakana Phonetic Extensions
188 "\u3200-\u32FF", // Enclosed CJK Letters and Months
189 "\u3400-\u4DBF", // CJK unified ideographs Extension A
190 "\u4E00-\u9FFF", // CJK Unified Ideographs
191 "\uA960-\uA97F", // Hangul Jamo Extended-A
192 "\uAC00-\uD7AF", // Hangul Syllables
193 "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
194 "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
// Hangul-only subset of CJK_RANGE, in the same "first-last" form.
// _splitCJKName joins these with WHITESPACE into a character class to detect
// names written entirely in Hangul.
198 "\u1100-\u11FF", // Hangul Jamo
199 "\u3130-\u318F", // Hangul Compatibility Jamo
200 "\uA960-\uA97F", // Hangul Jamo Extended-A
201 "\uAC00-\uD7AF", // Hangul Syllables
202 "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
207 // Returns true if |set| contains |token|, modulo a final period.
208 _containsString(set, token) {
209 let target = token.replace(/\.$/, "").toLowerCase();
210 return set.includes(target);
213 // Removes common name prefixes from |name_tokens|.
214 _stripPrefixes(nameTokens) {
215 for (let i in nameTokens) {
216 if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) {
217 return nameTokens.slice(i);
223 // Removes common name suffixes from |name_tokens|.
224 _stripSuffixes(nameTokens) {
225 for (let i = nameTokens.length - 1; i >= 0; i--) {
226 if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
227 return nameTokens.slice(0, i + 1);
234 // The name is considered to be a CJK name if it is only CJK characters,
235 // spaces, and "middle dot" separators, with at least one CJK character, and
236 // no more than 2 words.
238 // Chinese and Japanese names are usually spelled out using the Han
239 // characters (logographs), which constitute the "CJK Unified Ideographs"
240 // block in Unicode, also referred to as Unihan. Korean names are usually
241 // spelled out in the Korean alphabet (Hangul), although they do have a Han
242 // equivalent as well.
// Remembers whether the previous character was CJK, so that a run of
// consecutive CJK characters can be recognized as a single word.
248 let previousWasCJK = false;
// for...of walks the string by code point rather than by UTF-16 code unit.
251 for (let c of name) {
252 let isMiddleDot = this.MIDDLE_DOT.includes(c);
// KATAKANA MIDDLE DOT (U+30FB) lies inside the Katakana range of CJK_RANGE,
// so middle dots have to be excluded explicitly before consulting reCJK.
253 let isCJK = !isMiddleDot && this.reCJK.test(c);
// The character is outside all three permitted classes (CJK, middle dot,
// whitespace).
254 if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
// Transition from a non-CJK character (or the start) to a CJK character,
// i.e. the first character of a new CJK run.
257 if (isCJK && !previousWasCJK) {
260 previousWasCJK = isCJK;
// Accept only names made of exactly one or two CJK words.
263 return wordCount > 0 && wordCount < 3;
266 // Tries to split a Chinese, Japanese, or Korean name into its given name &
267 // surname parts. If splitting did not work for whatever reason, returns null.
// |nameTokens| is the array of name tokens; only arrays of length 1 or 2 are
// given an explicit split below.
268 _splitCJKName(nameTokens) {
269 // The convention for CJK languages is to put the surname (last name) first,
270 // and the given name (first name) second. In a continuous text, there is
271 // normally no space between the two parts of the name. When entering their
272 // name into a field, though, some people add a space to disambiguate. CJK
273 // names (almost) never have a middle name.
// Matches a non-empty string consisting solely of characters from
// HANGUL_RANGE and WHITESPACE. The regex is rebuilt on every call.
275 let reHangulName = new RegExp(
276 "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
285 if (nameTokens.length == 1) {
286 // There is no space between the surname and given name. Try to infer
287 // where to separate between the two. Most Chinese and Korean surnames
288 // have only one character, but there are a few that have 2. If the name
289 // does not start with a surname from a known list, default to one
291 let name = nameTokens[0];
292 let isKorean = reHangulName.test(name);
293 let surnameLength = 0;
295 // 4-character Korean names are more likely to be 2/2 than 1/3, so use
296 // the full list of Korean 2-char surnames. (instead of only the common
// NOTE(review): .length counts UTF-16 code units; that equals the character
// count here because every range in HANGUL_RANGE is within the BMP.
298 let multiCharSurnames =
299 isKorean && name.length > 3
300 ? this.KOREAN_MULTI_CHAR_SURNAMES
301 : this.COMMON_CJK_MULTI_CHAR_SURNAMES;
303 // Default to 1 character if the surname is not in the list.
304 surnameLength = multiCharSurnames.some(surname =>
305 name.startsWith(surname)
// The first |surnameLength| code units become the family name, the rest the
// given name.
// NOTE(review): String.prototype.substr is a legacy (Annex B) method; for a
// non-negative surnameLength, slice(0, surnameLength) and
// slice(surnameLength) are the equivalents.
310 nameParts.family = name.substr(0, surnameLength);
311 nameParts.given = name.substr(surnameLength);
312 } else if (nameTokens.length == 2) {
313 // The user entered a space between the two name parts. This makes our job
314 // easier. Family name first, given name second.
315 nameParts.family = nameTokens[0];
316 nameParts.given = nameTokens[1];
// _dataLoaded records that the setup below has already been performed.
325 if (this._dataLoaded) {
328 this._dataLoaded = true;
// Matcher for a single character from any CJK_RANGE entry; the "u" flag makes
// the pattern operate on code points. No "g" flag, so test() keeps no state
// between calls.
330 this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
// Tokenize on runs of ASCII space, comma, IDEOGRAPHIC SPACE, KATAKANA MIDDLE
// DOT and MIDDLE DOT, then drop leading honorific-style prefixes.
344 let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
345 nameTokens = this._stripPrefixes(nameTokens);
// The CJK check inspects the raw |name|, not the prefix-stripped tokens.
347 if (this._isCJKName(name)) {
// _splitCJKName is documented to return null when it cannot split the name.
348 let parts = this._splitCJKName(nameTokens);
354 // Don't assume "Ma" is a suffix in John Ma.
355 if (nameTokens.length > 2) {
356 nameTokens = this._stripSuffixes(nameTokens);
// Both strip helpers can consume every token, leaving an empty array.
359 if (!nameTokens.length) {
360 // Bad things have happened; just assume the whole thing is a given name.
361 nameParts.given = name;
365 // Only one token, assume given name.
366 if (nameTokens.length == 1) {
367 nameParts.given = nameTokens[0];
371 // 2 or more tokens. Grab the family, which is the last word plus any
372 // recognizable family prefixes.
// pop() removes the last word from nameTokens, so whatever remains in
// nameTokens afterwards is middle/given name material.
373 let familyTokens = [nameTokens.pop()];
374 while (nameTokens.length) {
375 let lastToken = nameTokens[nameTokens.length - 1];
376 if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
// Prepend so the family tokens keep their original left-to-right order.
379 familyTokens.unshift(lastToken);
382 nameParts.family = familyTokens.join(" ");
384 // Take the last remaining token as the middle name (if there are at least 2
386 if (nameTokens.length >= 2) {
387 nameParts.middle = nameTokens.pop();
390 // Remainder is given name.
391 nameParts.given = nameTokens.join(" ");
// Builds a single name string from an object with |given|, |middle| and
// |family| properties.
396 joinNameParts({ given, middle, family }) {
// When both parts are CJK names and there is no middle name, the result is
// written family-first with nothing between the two parts.
397 if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
398 return family + given;
// Otherwise use given, middle, family order, skipping parts that are
// undefined, null or empty.
400 return [given, middle, family]
401 .filter(part => part && part.length)
// Run the one-time setup as soon as this module is evaluated, so that derived
// state such as reCJK exists before any other method is called.
406 FormAutofillNameUtils.init();