toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs

   1 /* This Source Code Form is subject to the terms of the Mozilla Public
   2  * License, v. 2.0. If a copy of the MPL was not distributed with this
   3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   4
   5 // FormAutofillNameUtils is initially translated from
   6 // https://cs.chromium.org/chromium/src/components/autofill/core/browser/autofill_data_util.cc?rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
   7 export var FormAutofillNameUtils = {
   8   NAME_PREFIXES: [
   9     "1lt",
  10     "1st",
  11     "2lt",
  12     "2nd",
  13     "3rd",
  14     "admiral",
  15     "capt",
  16     "captain",
  17     "col",
  18     "cpt",
  19     "dr",
  20     "gen",
  21     "general",
  22     "lcdr",
  23     "lt",
  24     "ltc",
  25     "ltg",
  26     "ltjg",
  27     "maj",
  28     "major",
  29     "mg",
  30     "mr",
  31     "mrs",
  32     "ms",
  33     "pastor",
  34     "prof",
  35     "rep",
  36     "reverend",
  37     "rev",
  38     "sen",
  39     "st",
  40   ],
  41
  42   NAME_SUFFIXES: [
  43     "b.a",
  44     "ba",
  45     "d.d.s",
  46     "dds",
  47     "i",
  48     "ii",
  49     "iii",
  50     "iv",
  51     "ix",
  52     "jr",
  53     "m.a",
  54     "m.d",
  55     "ma",
  56     "md",
  57     "ms",
  58     "ph.d",
  59     "phd",
  60     "sr",
  61     "v",
  62     "vi",
  63     "vii",
  64     "viii",
  65     "x",
  66   ],
  67
  68   FAMILY_NAME_PREFIXES: [
  69     "d'",
  70     "de",
  71     "del",
  72     "der",
  73     "di",
  74     "la",
  75     "le",
  76     "mc",
  77     "san",
  78     "st",
  79     "ter",
  80     "van",
  81     "von",
  82   ],
  83
  84   // The common and non-ambiguous CJK surnames (last names) that have more than
  85   // one character.
  86   COMMON_CJK_MULTI_CHAR_SURNAMES: [
  87     // Korean, taken from the list of surnames:
  88     // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
  89     "남궁",
  90     "사공",
  91     "서문",
  92     "선우",
  93     "제갈",
  94     "황보",
  95     "독고",
  96     "망절",
  97
  98     // Chinese, taken from the top 10 Chinese 2-character surnames:
  99     // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
 100     // Simplified Chinese (mostly mainland China)
 101     "欧阳",
 102     "令狐",
 103     "皇甫",
 104     "上官",
 105     "司徒",
 106     "诸葛",
 107     "司马",
 108     "宇文",
 109     "呼延",
 110     "端木",
 111     // Traditional Chinese (mostly Taiwan)
 112     "張簡",
 113     "歐陽",
 114     "諸葛",
 115     "申屠",
 116     "尉遲",
 117     "司馬",
 118     "軒轅",
 119     "夏侯",
 120   ],
 121
 122   // All Korean surnames that have more than one character, even the
 123   // rare/ambiguous ones.
 124   KOREAN_MULTI_CHAR_SURNAMES: [
 125     "강전",
 126     "남궁",
 127     "독고",
 128     "동방",
 129     "망절",
 130     "사공",
 131     "서문",
 132     "선우",
 133     "소봉",
 134     "어금",
 135     "장곡",
 136     "제갈",
 137     "황목",
 138     "황보",
 139   ],
 140
 141   // The whitespace definition based on
 142   // https://cs.chromium.org/chromium/src/base/strings/string_util_constants.cc?l=9&rcl=b861deff77abecff11ae6a9f6946e9cc844b9817
 143   WHITESPACE: [
 144     "\u0009", // CHARACTER TABULATION
 145     "\u000A", // LINE FEED (LF)
 146     "\u000B", // LINE TABULATION
 147     "\u000C", // FORM FEED (FF)
 148     "\u000D", // CARRIAGE RETURN (CR)
 149     "\u0020", // SPACE
 150     "\u0085", // NEXT LINE (NEL)
 151     "\u00A0", // NO-BREAK SPACE
 152     "\u1680", // OGHAM SPACE MARK
 153     "\u2000", // EN QUAD
 154     "\u2001", // EM QUAD
 155     "\u2002", // EN SPACE
 156     "\u2003", // EM SPACE
 157     "\u2004", // THREE-PER-EM SPACE
 158     "\u2005", // FOUR-PER-EM SPACE
 159     "\u2006", // SIX-PER-EM SPACE
 160     "\u2007", // FIGURE SPACE
 161     "\u2008", // PUNCTUATION SPACE
 162     "\u2009", // THIN SPACE
 163     "\u200A", // HAIR SPACE
 164     "\u2028", // LINE SEPARATOR
 165     "\u2029", // PARAGRAPH SEPARATOR
 166     "\u202F", // NARROW NO-BREAK SPACE
 167     "\u205F", // MEDIUM MATHEMATICAL SPACE
 168     "\u3000", // IDEOGRAPHIC SPACE
 169   ],
 170
 171   // The middle dot is used as a separator for foreign names in Japanese.
 172   MIDDLE_DOT: [
 173     "\u30FB", // KATAKANA MIDDLE DOT
 174     "\u00B7", // A (common?) typo for "KATAKANA MIDDLE DOT"
 175   ],
 176
 177   // The Unicode range is based on Wiki:
 178   // https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
 179   // https://en.wikipedia.org/wiki/Hangul
 180   // https://en.wikipedia.org/wiki/Japanese_writing_system
 181   CJK_RANGE: [
 182     "\u1100-\u11FF", // Hangul Jamo
 183     "\u3040-\u309F", // Hiragana
 184     "\u30A0-\u30FF", // Katakana
 185     "\u3105-\u312C", // Bopomofo
 186     "\u3130-\u318F", // Hangul Compatibility Jamo
 187     "\u31F0-\u31FF", // Katakana Phonetic Extensions
 188     "\u3200-\u32FF", // Enclosed CJK Letters and Months
 189     "\u3400-\u4DBF", // CJK unified ideographs Extension A
 190     "\u4E00-\u9FFF", // CJK Unified Ideographs
 191     "\uA960-\uA97F", // Hangul Jamo Extended-A
 192     "\uAC00-\uD7AF", // Hangul Syllables
 193     "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
 194     "\uFF00-\uFFEF", // Halfwidth and Fullwidth Forms
 195   ],
 196
 197   HANGUL_RANGE: [
 198     "\u1100-\u11FF", // Hangul Jamo
 199     "\u3130-\u318F", // Hangul Compatibility Jamo
 200     "\uA960-\uA97F", // Hangul Jamo Extended-A
 201     "\uAC00-\uD7AF", // Hangul Syllables
 202     "\uD7B0-\uD7FF", // Hangul Jamo Extended-B
 203   ],
 204
 205   _dataLoaded: false,
 206
 207   // Returns true if |set| contains |token|, modulo a final period.
 208   _containsString(set, token) {
 209     let target = token.replace(/\.$/, "").toLowerCase();
 210     return set.includes(target);
 211   },
 212
 213   // Removes common name prefixes from |name_tokens|.
 214   _stripPrefixes(nameTokens) {
 215     for (let i in nameTokens) {
 216       if (!this._containsString(this.NAME_PREFIXES, nameTokens[i])) {
 217         return nameTokens.slice(i);
 218       }
 219     }
 220     return [];
 221   },
 222
 223   // Removes common name suffixes from |name_tokens|.
 224   _stripSuffixes(nameTokens) {
 225     for (let i = nameTokens.length - 1; i >= 0; i--) {
 226       if (!this._containsString(this.NAME_SUFFIXES, nameTokens[i])) {
 227         return nameTokens.slice(0, i + 1);
 228       }
 229     }
 230     return [];
 231   },
 232
 233   _isCJKName(name) {
 234     // The name is considered to be a CJK name if it is only CJK characters,
 235     // spaces, and "middle dot" separators, with at least one CJK character, and
 236     // no more than 2 words.
 237     //
 238     // Chinese and Japanese names are usually spelled out using the Han
 239     // characters (logographs), which constitute the "CJK Unified Ideographs"
 240     // block in Unicode, also referred to as Unihan. Korean names are usually
 241     // spelled out in the Korean alphabet (Hangul), although they do have a Han
 242     // equivalent as well.
 243
 244     if (!name) {
 245       return false;
 246     }
 247
 248     let previousWasCJK = false;
 249     let wordCount = 0;
 250
 251     for (let c of name) {
 252       let isMiddleDot = this.MIDDLE_DOT.includes(c);
 253       let isCJK = !isMiddleDot && this.reCJK.test(c);
 254       if (!isCJK && !isMiddleDot && !this.WHITESPACE.includes(c)) {
 255         return false;
 256       }
 257       if (isCJK && !previousWasCJK) {
 258         wordCount++;
 259       }
 260       previousWasCJK = isCJK;
 261     }
 262
 263     return wordCount > 0 && wordCount < 3;
 264   },
 265
 266   // Tries to split a Chinese, Japanese, or Korean name into its given name &
 267   // surname parts. If splitting did not work for whatever reason, returns null.
 268   _splitCJKName(nameTokens) {
 269     // The convention for CJK languages is to put the surname (last name) first,
 270     // and the given name (first name) second. In a continuous text, there is
 271     // normally no space between the two parts of the name. When entering their
 272     // name into a field, though, some people add a space to disambiguate. CJK
 273     // names (almost) never have a middle name.
 274
 275     let reHangulName = new RegExp(
 276       "^[" + this.HANGUL_RANGE.join("") + this.WHITESPACE.join("") + "]+$",
 277       "u"
 278     );
 279     let nameParts = {
 280       given: "",
 281       middle: "",
 282       family: "",
 283     };
 284
 285     if (nameTokens.length == 1) {
 286       // There is no space between the surname and given name. Try to infer
 287       // where to separate between the two. Most Chinese and Korean surnames
 288       // have only one character, but there are a few that have 2. If the name
 289       // does not start with a surname from a known list, default to one
 290       // character.
 291       let name = nameTokens[0];
 292       let isKorean = reHangulName.test(name);
 293       let surnameLength = 0;
 294
 295       // 4-character Korean names are more likely to be 2/2 than 1/3, so use
 296       // the full list of Korean 2-char surnames. (instead of only the common
 297       // ones)
 298       let multiCharSurnames =
 299         isKorean && name.length > 3
 300           ? this.KOREAN_MULTI_CHAR_SURNAMES
 301           : this.COMMON_CJK_MULTI_CHAR_SURNAMES;
 302
 303       // Default to 1 character if the surname is not in the list.
 304       surnameLength = multiCharSurnames.some(surname =>
 305         name.startsWith(surname)
 306       )
 307         ? 2
 308         : 1;
 309
 310       nameParts.family = name.substr(0, surnameLength);
 311       nameParts.given = name.substr(surnameLength);
 312     } else if (nameTokens.length == 2) {
 313       // The user entered a space between the two name parts. This makes our job
 314       // easier. Family name first, given name second.
 315       nameParts.family = nameTokens[0];
 316       nameParts.given = nameTokens[1];
 317     } else {
 318       return null;
 319     }
 320
 321     return nameParts;
 322   },
 323
 324   init() {
 325     if (this._dataLoaded) {
 326       return;
 327     }
 328     this._dataLoaded = true;
 329
 330     this.reCJK = new RegExp("[" + this.CJK_RANGE.join("") + "]", "u");
 331   },
 332
 333   splitName(name) {
 334     let nameParts = {
 335       given: "",
 336       middle: "",
 337       family: "",
 338     };
 339
 340     if (!name) {
 341       return nameParts;
 342     }
 343
 344     let nameTokens = name.trim().split(/[ ,\u3000\u30FB\u00B7]+/);
 345     nameTokens = this._stripPrefixes(nameTokens);
 346
 347     if (this._isCJKName(name)) {
 348       let parts = this._splitCJKName(nameTokens);
 349       if (parts) {
 350         return parts;
 351       }
 352     }
 353
 354     // Don't assume "Ma" is a suffix in John Ma.
 355     if (nameTokens.length > 2) {
 356       nameTokens = this._stripSuffixes(nameTokens);
 357     }
 358
 359     if (!nameTokens.length) {
 360       // Bad things have happened; just assume the whole thing is a given name.
 361       nameParts.given = name;
 362       return nameParts;
 363     }
 364
 365     // Only one token, assume given name.
 366     if (nameTokens.length == 1) {
 367       nameParts.given = nameTokens[0];
 368       return nameParts;
 369     }
 370
 371     // 2 or more tokens. Grab the family, which is the last word plus any
 372     // recognizable family prefixes.
 373     let familyTokens = [nameTokens.pop()];
 374     while (nameTokens.length) {
 375       let lastToken = nameTokens[nameTokens.length - 1];
 376       if (!this._containsString(this.FAMILY_NAME_PREFIXES, lastToken)) {
 377         break;
 378       }
 379       familyTokens.unshift(lastToken);
 380       nameTokens.pop();
 381     }
 382     nameParts.family = familyTokens.join(" ");
 383
 384     // Take the last remaining token as the middle name (if there are at least 2
 385     // tokens).
 386     if (nameTokens.length >= 2) {
 387       nameParts.middle = nameTokens.pop();
 388     }
 389
 390     // Remainder is given name.
 391     nameParts.given = nameTokens.join(" ");
 392
 393     return nameParts;
 394   },
 395
 396   joinNameParts({ given, middle, family }) {
 397     if (this._isCJKName(given) && this._isCJKName(family) && !middle) {
 398       return family + given;
 399     }
 400     return [given, middle, family]
 401       .filter(part => part && part.length)
 402       .join(" ");
 403   },
 404 };
 405
 406 FormAutofillNameUtils.init();