netwerk/test/unit/test_idn_urls.js

   1 // Test algorithm for unicode display of IDNA URL (bug 722299)
   2
   3 "use strict";
   4
   5 const testcases = [
   6   // Row format: [Unicode label, Punycode or ASCII-normalized label, expect Unicode display?]
   7   // (run_test() appends ".com" to both labels before converting them)
   8
   9   // Latin script
  10   ["cuillère", "xn--cuillre-6xa", true],
  11
  12   // repeated non-spacing marks
  13   ["gruz̀̀ere", "xn--gruzere-ogea", false],
  14
  15   // non-XID character
  16   ["I♥NY", "xn--iny-zx5a", false],
  17
  18   /*
  19   Behaviour of this test changed in IDNA2008, replacing the non-XID
  20   character with U+FFFD replacement character - when all platforms use
  21   IDNA2008 it can be uncommented and the punycode URL changed to
  22    "xn--mgbl3eb85703a"
  23
  24     // new non-XID character in Unicode 6.3
  25     ["حلا\u061cل", "xn--bgbvr6gc",                    false],
  26 */
  27
  28   // U+30FB KATAKANA MIDDLE DOT is excluded from non-XID characters (bug 857490)
  29   ["乾燥肌・石けん", "xn--08j4gylj12hz80b0uhfup", true],
  30
  31   // Cyrillic alone
  32   ["толсто́й", "xn--lsa83dealbred", true],
  33
  34   // Mixed script Cyrillic/Latin
  35   ["толсто́й-in-Russian", "xn---in-russian-1jg071b0a8bb4cpd", false],
  36
  37   // Mixed script Latin/Cyrillic
  38   ["war-and-миръ", "xn--war-and--b9g3b7b3h", false],
  39
  40   // Cherokee (Restricted script)
  41   ["ᏣᎳᎩ", "xn--f9dt7l", false],
  42
  43   // Yi (former Aspirational script, now Restricted per Unicode 10.0 update to UAX 31)
  44   ["ꆈꌠꁱꂷ", "xn--4o7a6e1x64c", false],
  45
  46   // Greek alone
  47   ["πλάτων", "xn--hxa3ahjw4a", true],
  48
  49   // Mixed script Greek/Latin
  50   ["πλάτωνicrelationship", "xn--icrelationship-96j4t9a3cwe2e", false],
  51
  52   // Mixed script Latin/Greek
  53   ["spaceὈδύσσεια", "xn--space-h9dui0b0ga2j1562b", false],
  54
  55   // Devanagari alone
  56   ["मराठी", "xn--d2b1ag0dl", true],
  57
  58   // Devanagari with Armenian
  59   ["मराठीՀայաստան", "xn--y9aaa1d0ai1cq964f8dwa2o1a", false],
  60
  61   // Devanagari with common
  62   ["मराठी123", "xn--123-mhh3em2hra", true],
  63
  64   // Common with Devanagari
  65   ["123मराठी", "xn--123-phh3em2hra", true],
  66
  67   // Latin with Han
  68   ["chairman毛", "xn--chairman-k65r", true],
  69
  70   // Han with Latin
  71   ["山葵sauce", "xn--sauce-6j9ii40v", true],
  72
  73   // Latin with Han, Hiragana and Katakana
  74   ["van語ではドイ", "xn--van-ub4bpb6w0in486d", true],
  75
  76   // Latin with Han, Katakana and Hiragana
  77   ["van語ドイでは", "xn--van-ub4bpb4w0ip486d", true],
  78
  79   // Latin with Hiragana, Han and Katakana
  80   ["vanでは語ドイ", "xn--van-ub4bpb6w0ip486d", true],
  81
  82   // Latin with Hiragana, Katakana and Han
  83   ["vanではドイ語", "xn--van-ub4bpb6w0ir486d", true],
  84
  85   // Latin with Katakana, Han and Hiragana
  86   ["vanドイ語では", "xn--van-ub4bpb4w0ir486d", true],
  87
  88   // Latin with Katakana, Hiragana and Han
  89   ["vanドイでは語", "xn--van-ub4bpb4w0it486d", true],
  90
  91   // Han with Latin, Hiragana and Katakana
  92   ["語vanではドイ", "xn--van-ub4bpb6w0ik486d", true],
  93
  94   // Han with Latin, Katakana and Hiragana
  95   ["語vanドイでは", "xn--van-ub4bpb4w0im486d", true],
  96
  97   // Han with Hiragana, Latin and Katakana
  98   ["語ではvanドイ", "xn--van-rb4bpb9w0ik486d", true],
  99
 100   // Han with Hiragana, Katakana and Latin
 101   ["語ではドイvan", "xn--van-rb4bpb6w0in486d", true],
 102
 103   // Han with Katakana, Latin and Hiragana
 104   ["語ドイvanでは", "xn--van-ub4bpb1w0ip486d", true],
 105
 106   // Han with Katakana, Hiragana and Latin
 107   ["語ドイではvan", "xn--van-rb4bpb4w0ip486d", true],
 108
 109   // Hiragana with Latin, Han and Katakana -- NOTE(review): this label starts with Katakana "イツ"; confirm the description
 110   ["イツvan語ではド", "xn--van-ub4bpb1wvhsbx330n", true],
 111
 112   // Hiragana with Latin, Katakana and Han
 113   ["ではvanドイ語", "xn--van-rb4bpb9w0ir486d", true],
 114
 115   // Hiragana with Han, Latin and Katakana
 116   ["では語vanドイ", "xn--van-rb4bpb9w0im486d", true],
 117
 118   // Hiragana with Han, Katakana and Latin
 119   ["では語ドイvan", "xn--van-rb4bpb6w0ip486d", true],
 120
 121   // Hiragana with Katakana, Latin and Han
 122   ["ではドイvan語", "xn--van-rb4bpb6w0iu486d", true],
 123
 124   // Hiragana with Katakana, Han and Latin
 125   ["ではドイ語van", "xn--van-rb4bpb6w0ir486d", true],
 126
 127   // Katakana with Latin, Han and Hiragana
 128   ["ドイvan語では", "xn--van-ub4bpb1w0iu486d", true],
 129
 130   // Katakana with Latin, Hiragana and Han
 131   ["ドイvanでは語", "xn--van-ub4bpb1w0iw486d", true],
 132
 133   // Katakana with Han, Latin and Hiragana
 134   ["ドイ語vanでは", "xn--van-ub4bpb1w0ir486d", true],
 135
 136   // Katakana with Han, Hiragana and Latin
 137   ["ドイ語ではvan", "xn--van-rb4bpb4w0ir486d", true],
 138
 139   // Katakana with Hiragana, Latin and Han
 140   ["ドイではvan語", "xn--van-rb4bpb4w0iw486d", true],
 141
 142   // Katakana with Hiragana, Han and Latin
 143   ["ドイでは語van", "xn--van-rb4bpb4w0it486d", true],
 144
 145   // Han with common
 146   ["中国123", "xn--123-u68dy61b", true],
 147
 148   // common with Han
 149   ["123中国", "xn--123-x68dy61b", true],
 150
 151   // Characters that normalize to permitted characters
 152   //  (also tests Plane 1 supplementary characters)
 153   ["super𝟖", "super8", true],
 154
 155   // Han from Plane 2
 156   ["𠀀𠀁𠀂", "xn--j50icd", true],
 157
 158   // Han from Plane 2 with js (UTF-16) escapes
 159   ["\uD840\uDC00\uD840\uDC01\uD840\uDC02", "xn--j50icd", true],
 160
 161   // Same with a lone high surrogate at the end
 162   // Throws due to unpaired surrogate
 163   //  ["\uD840\uDC00\uD840\uDC01\uD840", "xn--zn7c0336bda", false],
 164
 165   // Latin text and Bengali digits
 166   ["super৪", "xn--super-k2l", false],
 167
 168   // Bengali digits and Latin text
 169   ["৫ab", "xn--ab-x5f", false],
 170
 171   // Bengali text and Latin digits
 172   ["অঙ্কুর8", "xn--8-70d2cp0j6dtd", true],
 173
 174   // Latin digits and Bengali text
 175   ["5াব", "xn--5-h3d7c", true],
 176
 177   // Mixed numbering systems
 178   // Throws due to bidi rule violation
 179   // ["٢٠۰٠", "xn--8hbae38c", false],
 180
 181   // Traditional Chinese
 182   ["萬城", "xn--uis754h", true],
 183
 184   // Simplified Chinese
 185   ["万城", "xn--chq31v", true],
 186
 187   // Simplified-only and Traditional-only Chinese in the same label
 188   ["万萬城", "xn--chq31vsl1b", true],
 189
 190   // Traditional-only and Simplified-only Chinese in the same label
 191   ["萬万城", "xn--chq31vrl1b", true],
 192
 193   // Han and Latin and Bopomofo
 194   ["注音符号bopomofoㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm1637i876cuw0brk5f", true],
 195
 196   // Han, bopomofo, Latin
 197   // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph,
 198   // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ".
 199   ["注音符号ㄅbopomofo", "xn--bopomofo-8i5gx891aylvccz9asi4e", true],
 200
 201   // Latin, Han, Bopomofo
 202   ["bopomofo注音符号ㄅㄆㄇㄈ", "xn--bopomofo-hj5gkalm9637i876cuw0brk5f", true],
 203
 204   // Latin, Bopomofo, Han
 205   ["bopomofoㄅㄆㄇㄈ注音符号", "xn--bopomofo-hj5gkalm3737i876cuw0brk5f", true],
 206
 207   // Bopomofo, Han, Latin
 208   ["ㄅㄆㄇㄈ注音符号bopomofo", "xn--bopomofo-8i5gkalm3737i876cuw0brk5f", true],
 209
 210   // Bopomofo, Latin, Han
 211   // Bug 1885096: Since the last character of "ㄅㄆㄇㄈ" is a CJK Ideograph,
 212   // just use the first character "ㄅ" from the sequence "ㄅㄆㄇㄈ".
 213   ["ㄅbopomofo注音符号", "xn--bopomofo-8i5g6891aylvccz9asi4e", true],
 214
 215   // Han, bopomofo and katakana
 216   ["注音符号ㄅㄆㄇㄈボポモフォ", "xn--jckteuaez1shij0450gylvccz9asi4e", false],
 217
 218   // Han, katakana, bopomofo
 219   ["注音符号ボポモフォㄅㄆㄇㄈ", "xn--jckteuaez6shij5350gylvccz9asi4e", false],
 220
 221   // bopomofo, han, katakana
 222   ["ㄅㄆㄇㄈ注音符号ボポモフォ", "xn--jckteuaez1shij4450gylvccz9asi4e", false],
 223
 224   // bopomofo, katakana, han
 225   ["ㄅㄆㄇㄈボポモフォ注音符号", "xn--jckteuaez1shij9450gylvccz9asi4e", false],
 226
 227   // katakana, Han, bopomofo
 228   ["ボポモフォ注音符号ㄅㄆㄇㄈ", "xn--jckteuaez6shij0450gylvccz9asi4e", false],
 229
 230   // katakana, bopomofo, Han
 231   ["ボポモフォㄅㄆㄇㄈ注音符号", "xn--jckteuaez6shij4450gylvccz9asi4e", false],
 232
 233   // Han, Hangul and Latin
 234   ["韓한글hangul", "xn--hangul-2m5ti09k79ze", true],
 235
 236   // Han, Latin and Hangul
 237   ["韓hangul한글", "xn--hangul-2m5to09k79ze", true],
 238
 239   // Hangul, Han and Latin
 240   ["한글韓hangul", "xn--hangul-2m5th09k79ze", true],
 241
 242   // Hangul, Latin and Han
 243   ["한글hangul韓", "xn--hangul-8m5t898k79ze", true],
 244
 245   // Latin, Han and Hangul
 246   ["hangul韓한글", "xn--hangul-8m5ti09k79ze", true],
 247
 248   // Latin, Hangul and Han
 249   ["hangul한글韓", "xn--hangul-8m5th09k79ze", true],
 250
 251   // Hangul and katakana
 252   ["한글ハングル", "xn--qck1c2d4a9266lkmzb", false],
 253
 254   // Katakana and Hangul
 255   ["ハングル한글", "xn--qck1c2d4a2366lkmzb", false],
 256
 257   // Thai (also tests that node with over 63 UTF-8 octets doesn't fail)
 258   [
 259     "เครื่องทําน้ําทําน้ําแข็ง",
 260     "xn--22cdjb2fanb9fyepcbbb9dwh4a3igze4fdcd",
 261     true,
 262   ],
 263
 264   // Effect of adding valid or invalid subdomains (bug 1399540)
 265   ["䕮䕵䕶䕱.ascii", "xn--google.ascii", true],
 266   ["ascii.䕮䕵䕶䕱", "ascii.xn--google", true],
 267   ["中国123.䕮䕵䕶䕱", "xn--123-u68dy61b.xn--google", true],
 268   ["䕮䕵䕶䕱.中国123", "xn--google.xn--123-u68dy61b", true],
 269   // Throws due to bogus Punycode
 270   // [
 271   //   "xn--accountlogin.䕮䕵䕶䕱",
 272   //   "xn--accountlogin.xn--google",
 273   //   true,
 274   // ],
 275   // [
 276   //   "䕮䕵䕶䕱.xn--accountlogin",
 277   //   "xn--google.xn--accountlogin",
 278   //   true,
 279   // ],
 280
 281   // Arabic diacritic not allowed in Latin text (bug 1370497)
 282   ["goo\u0650gle", "xn--google-yri", false],
 283   // ...but Arabic diacritics are allowed on Arabic text
 284   ["العَرَبِي", "xn--mgbc0a5a6cxbzabt", true],
 285
 286   // Hebrew diacritic also not allowed in Latin text (bug 1404349)
 287   ["goo\u05b4gle", "xn--google-rvh", false],
 288
 289   // Accents above dotless-i are not allowed
 290   ["na\u0131\u0308ve", "xn--nave-mza04z", false],
 291   ["d\u0131\u0302ner", "xn--dner-lza40z", false],
 292   // but the corresponding accented-i (based on dotted i) is OK (these rows already end in ".com"; run_test() appends another)
 293   ["na\u00efve.com", "xn--nave-6pa.com", true],
 294   ["d\u00eener.com", "xn--dner-0pa.com", true],
 295 ];
 296
 297 function run_test() {
 298   var idnService = Cc["@mozilla.org/network/idn-service;1"].getService(
 299     Ci.nsIIDNService
 300   );
 301
 302   for (var j = 0; j < testcases.length; ++j) {
 303     var test = testcases[j];
 304     var URL = test[0] + ".com";
 305     var punycodeURL = test[1] + ".com";
 306     var expectedUnicode = test[2];
 307
 308     var result;
 309     try {
 310       result = idnService.convertToDisplayIDN(URL);
 311     } catch (e) {
 312       result = ".com";
 313     }
 314     if (
 315       punycodeURL.substr(0, 4) == "xn--" ||
 316       punycodeURL.indexOf(".xn--") > 0
 317     ) {
 318       // test convertToDisplayIDN with a Unicode URL and with a
 319       //  Punycode URL if we have one
 320       Assert.equal(
 321         escape(result),
 322         expectedUnicode ? escape(URL) : escape(punycodeURL)
 323       );
 324
 325       result = idnService.convertToDisplayIDN(punycodeURL);
 326       Assert.equal(
 327         escape(result),
 328         expectedUnicode ? escape(URL) : escape(punycodeURL)
 329       );
 330     } else {
 331       // The "punycode" URL isn't punycode. This happens in testcases
 332       // where the Unicode URL has become normalized to an ASCII URL,
 333       // so, even though expectedUnicode is true, the expected result
 334       // is equal to punycodeURL
 335       Assert.equal(escape(result), escape(punycodeURL));
 336     }
 337   }
 338 }