pkgs/servers/dict/wiktionary/wiktionary2dict.py

   1 # Adapted to produce DICT-compatible files by Petr Rockai in 2012
   2 # Based on code from wiktiondict by Greg Hewgill
   3 import re
   4 import sys
   5 import os
   6 import textwrap
   7 import time
   8 import xml.sax
   9
  10 class Text:
  11     def __init__(self, s):
  12         self.s = s
  13     def process(self):
  14         return s
  15
  16 class TemplateCall:
  17     def __init__(self):
  18         pass
  19     def process(self):
  20         pass
  21
  22 class Template:
  23     def __init__(self):
  24         self.parts = []
  25     def append(self, part):
  26         self.parts.append(part)
  27     def process(self):
  28         return ''.join(x.process() for x in self.parts)
  29
  30 class Whitespace:
  31     def __init__(self, s):
  32         self.s = s
  33
  34 class OpenDouble: pass
  35 class OpenTriple: pass
  36 class CloseDouble: pass
  37 class CloseTriple: pass
  38
  39 class Equals:
  40     def __str__(self):
  41         return "="
  42
  43 class Delimiter:
  44     def __init__(self, c):
  45         self.c = c
  46     def __str__(self):
  47         return self.c
  48
  49 def Tokenise(s):
  50     s = str(s)
  51     stack = []
  52     last = 0
  53     i = 0
  54     while i < len(s):
  55         if s[i] == '{' and i+1 < len(s) and s[i+1] == '{':
  56             if i > last:
  57                 yield s[last:i]
  58             if i+2 < len(s) and s[i+2] == '{':
  59                 yield OpenTriple()
  60                 stack.append(3)
  61                 i += 3
  62             else:
  63                 yield OpenDouble()
  64                 stack.append(2)
  65                 i += 2
  66             last = i
  67         elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}':
  68             if i > last:
  69                 yield s[last:i]
  70             if len(stack) == 0:
  71                 yield "}}"
  72                 i += 2
  73             elif stack[-1] == 2:
  74                 yield CloseDouble()
  75                 i += 2
  76                 stack.pop()
  77             elif i+2 < len(s) and s[i+2] == '}':
  78                 yield CloseTriple()
  79                 i += 3
  80                 stack.pop()
  81             else:
  82                 raise SyntaxError()
  83             last = i
  84         elif s[i] == ':' or s[i] == '|':
  85             if i > last:
  86                 yield s[last:i]
  87             yield Delimiter(s[i])
  88             i += 1
  89             last = i
  90         elif s[i] == '=':
  91             if i > last:
  92                 yield s[last:i]
  93             yield Equals()
  94             i += 1
  95             last = i
  96         #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
  97         #    if i > last:
  98         #        yield s[last:i]
  99         #    last = i
 100         #    m = re.match(r"\s+", s[i:])
 101         #    assert m
 102         #    yield Whitespace(m.group(0))
 103         #    i += len(m.group(0))
 104         #    last = i
 105         else:
 106             i += 1
 107     if i > last:
 108         yield s[last:i]
 109
 110 def processSub(templates, tokens, args):
 111     t = next(tokens)
 112     if not isinstance(t, str):
 113         raise SyntaxError
 114     name = t
 115     t = next(tokens)
 116     default = None
 117     if isinstance(t, Delimiter) and t.c == '|':
 118         default = ""
 119         while True:
 120             t = next(tokens)
 121             if isinstance(t, str):
 122                 default += t
 123             elif isinstance(t, OpenDouble):
 124                 default += processTemplateCall(templates, tokens, args)
 125             elif isinstance(t, OpenTriple):
 126                 default += processSub(templates, tokens, args)
 127             elif isinstance(t, CloseTriple):
 128                 break
 129             else:
 130                 print("Unexpected:", t)
 131                 raise SyntaxError()
 132     if name in args:
 133         return args[name]
 134     if default is not None:
 135         return default
 136     if name == "lang":
 137         return "en"
 138     return "{{{%s}}}" % name
 139
 140 def processTemplateCall(templates, tokens, args):
 141     template = tokens.next().strip().lower()
 142     args = {}
 143     a = 1
 144     t = next(tokens)
 145     while True:
 146         if isinstance(t, Delimiter):
 147             name = str(a)
 148             arg = ""
 149             while True:
 150                 t = next(tokens)
 151                 if isinstance(t, str):
 152                     arg += t
 153                 elif isinstance(t, OpenDouble):
 154                     arg += processTemplateCall(templates, tokens, args)
 155                 elif isinstance(t, OpenTriple):
 156                     arg += processSub(templates, tokens, args)
 157                 elif isinstance(t, Delimiter) and t.c != '|':
 158                     arg += str(t)
 159                 else:
 160                     break
 161             if isinstance(t, Equals):
 162                 name = arg.strip()
 163                 arg = ""
 164                 while True:
 165                     t = next(tokens)
 166                     if isinstance(t, (str, Equals)):
 167                         arg += str(t)
 168                     elif isinstance(t, OpenDouble):
 169                         arg += processTemplateCall(templates, tokens, args)
 170                     elif isinstance(t, OpenTriple):
 171                         arg += processSub(templates, tokens, args)
 172                     elif isinstance(t, Delimiter) and t.c != '|':
 173                         arg += str(t)
 174                     else:
 175                         break
 176                 arg = arg.strip()
 177             else:
 178                 a += 1
 179             args[name] = arg
 180         elif isinstance(t, CloseDouble):
 181             break
 182         else:
 183             print("Unexpected:", t)
 184             raise SyntaxError
 185     #print template, args
 186     if template[0] == '#':
 187         if template == "#if":
 188             if args['1'].strip():
 189                 return args['2']
 190             elif '3' in args:
 191                 return args['3']
 192             else:
 193                 return ""
 194         elif template == "#ifeq":
 195             if args['1'].strip() == args['2'].strip():
 196                 return args['3']
 197             elif '4' in args:
 198                 return args['4']
 199             else:
 200                 return ""
 201         elif template == "#ifexist":
 202             return ""
 203         elif template == "#switch":
 204             sw = args['1'].strip()
 205             if sw in args:
 206                 return args[sw]
 207             else:
 208                 return ""
 209         else:
 210             print("Unknown ParserFunction:", template)
 211             sys.exit(1)
 212     if template not in templates:
 213         return "{{%s}}" % template
 214     return process(templates, templates[template], args)
 215
 216 def process(templates, s, args = {}):
 217     s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s)
 218     s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s)
 219     assert "<onlyinclude>" not in s
 220     #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
 221     s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s)
 222     r = ""
 223     #print list(Tokenise(s))
 224     tokens = Tokenise(s)
 225     try:
 226         while True:
 227             t = next(tokens)
 228             if isinstance(t, OpenDouble):
 229                 r += processTemplateCall(templates, tokens, args)
 230             elif isinstance(t, OpenTriple):
 231                 r += processSub(templates, tokens, args)
 232             else:
 233                 r += str(t)
 234     except StopIteration:
 235         pass
 236     return r
 237
 238 def test():
 239     templates = {
 240         'lb': "{{",
 241         'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
 242         't': "start-{{{1|pqr}}}-end",
 243         't0': "start-{{{1}}}-end",
 244         't1': "start{{{1}}}end<noinclude>moo</noinclude>",
 245         't2a1': "{{t2demo|a|{{{1}}}}}",
 246         't2a2': "{{t2demo|a|2={{{1}}}}}",
 247         't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
 248         't5': "{{t2demo|{{{a}}}=b}}",
 249         't6': "t2demo|a",
 250     }
 251     def t(text, expected):
 252         print("text:", text)
 253         s = process(templates, text)
 254         if s != expected:
 255             print("got:", s)
 256             print("expected:", expected)
 257             sys.exit(1)
 258     t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
 259     t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
 260     t("{{t0|a}}", "start-a-end")
 261     t("{{t0| }}", "start- -end")
 262     t("{{t0|}}", "start--end")
 263     t("{{t0}}", "start-{{{1}}}-end")
 264     t("{{t0|     }}", "start-     -end")
 265     t("{{t0|\n}}", "start-\n-end")
 266     t("{{t0|1=     }}", "start--end")
 267     t("{{t0|1=\n}}", "start--end")
 268     t("{{T}}", "start-pqr-end")
 269     t("{{T|}}", "start--end")
 270     t("{{T|abc}}", "start-abc-end")
 271     t("{{T|abc|def}}", "start-abc-end")
 272     t("{{T|1=abc|1=def}}", "start-def-end")
 273     t("{{T|abc|1=def}}", "start-def-end")
 274     t("{{T|1=abc|def}}", "start-def-end")
 275     t("{{T|{{T}}}}", "start-start-pqr-end-end")
 276     t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
 277     t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
 278     t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
 279     t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
 280     t("{{T|a=b}}", "start-pqr-end")
 281     t("{{T|1=a=b}}", "start-a=b-end")
 282     #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
 283     #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
 284     #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
 285     #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
 286     #t("{{ {{t6}} }}", "{{ t2demo|a }}")
 287     t("{{t|[[a|b]]}}", "start-b-end")
 288     t("{{t|[[a|b]] }}", "start-b -end")
 289
# Table of known Wiktionary section headings.  Each key is a heading exactly
# as it appears between the '=' signs; the value is a short label for the
# heading, or None where no label is assigned.  formatNormal and formatBrief
# only test headings for membership in this table; the label values are
# referenced solely from commented-out code.
Parts = {
    # Standard POS headers
    'noun': "n.",
    'Noun': "n.",
    'Noun 1': "n.",
    'Noun 2': "n.",
    'Verb': "v.",
    'Adjective': "adj.",
    'Adverb': "adv.",
    'Pronoun': "pron.",
    'Conjunction': "conj.",
    'Interjection': "interj.",
    'Preposition': "prep.",
    'Proper noun': "n.p.",
    'Proper Noun': "n.p.",
    'Article': "art.",

    # Standard non-POS level 3 headers
    '{{acronym}}': "acr.",
    'Acronym': "acr.",
    '{{abbreviation}}': "abbr.",
    '[[Abbreviation]]': "abbr.",
    'Abbreviation': "abbr.",
    '[[initialism]]': "init.",
    '{{initialism}}': "init.",
    'Initialism': "init.",
    'Contraction': "cont.",
    'Prefix': "prefix",
    'Suffix': "suffix",
    'Symbol': "sym.",
    'Letter': "letter",
    'Idiom': "idiom",
    'Idioms': "idiom",
    'Phrase': "phrase",

    # Debated POS level 3 headers
    'Number': "num.",
    'Numeral': "num.",
    'Cardinal number': "num.",
    'Ordinal number': "num.",
    'Cardinal numeral': "num.",
    'Ordinal numeral': "num.",

    # Other headers in use
    'Personal pronoun': "pers.pron.",
    'Adjective/Adverb': "adj./adv.",
    'Proper adjective': "prop.adj.",
    'Determiner': "det.",
    'Demonstrative determiner': "dem.det.",
    'Clitic': "clitic",
    'Infix': "infix",
    'Counter': "counter",
    'Kanji': None,
    'Kanji reading': None,
    'Hiragana letter': None,
    'Katakana letter': None,
    'Pinyin': None,
    'Han character': None,
    'Hanzi': None,
    'Hanja': None,
    'Proverb': "prov.",
    'Expression': None,
    'Adjectival noun': None,
    'Quasi-adjective': None,
    'Particle': "part.",
    'Infinitive particle': "part.",
    'Possessive adjective': "poss.adj.",
    'Verbal prefix': "v.p.",
    'Postposition': "post.",
    'Prepositional article': "prep.art.",
    'Phrasal verb': "phr.v.",
    'Participle': "participle",
    'Interrogative auxiliary verb': "int.aux.v.",
    'Pronominal adverb': "pron.adv.",
    'Adnominal': "adn.",
    'Abstract pronoun': "abs.pron.",
    'Conjunction particle': None,
    'Root': "root",

    # Non-standard, deprecated headers
    'Noun form': "n.",
    'Verb form': "v.",
    'Adjective form': "adj.form.",
    'Nominal phrase': "nom.phr.",
    'Noun phrase': "n. phrase",
    'Verb phrase': "v. phrase",
    'Transitive verb': "v.t.",
    'Intransitive verb': "v.i.",
    'Reflexive verb': "v.r.",
    'Cmavo': None,
    'Romaji': "rom.",
    'Hiragana': None,
    'Furigana': None,
    'Compounds': None,

    # Other headers seen
    'Alternative forms': None,
    'Alternative spellings': None,
    'Anagrams': None,
    'Antonym': None,
    'Antonyms': None,
    'Conjugation': None,
    'Declension': None,
    'Declension and pronunciations': None,
    'Definite Article': "def.art.",
    'Definite article': "def.art.",
    'Demonstrative pronoun': "dem.pron.",
    'Derivation': None,
    'Derived expression': None,
    'Derived expressions': None,
    'Derived forms': None,
    'Derived phrases': None,
    'Derived terms': None,
    'Derived, Related terms': None,
    'Descendants': None,
    #'Etymology': None,
    #'Etymology 1': None,
    #'Etymology 2': None,
    #'Etymology 3': None,
    #'Etymology 4': None,
    #'Etymology 5': None,
    'Examples': None,
    'External links': None,
    '[[Gismu]]': None,
    'Gismu': None,
    'Homonyms': None,
    'Homophones': None,
    'Hyphenation': None,
    'Indefinite article': "art.",
    'Indefinite pronoun': "ind.pron.",
    'Indefinite Pronoun': "ind.pron.",
    'Indetermined pronoun': "ind.pron.",
    'Interrogative conjunction': "int.conj.",
    'Interrogative determiner': "int.det.",
    'Interrogative particle': "int.part.",
    'Interrogative pronoun': "int.pron.",
    'Legal expression': "legal",
    'Mass noun': "n.",
    'Miscellaneous': None,
    'Mutations': None,
    'Noun and verb': "n/v.",
    'Other language': None,
    'Pinyin syllable': None,
    'Possessive determiner': "poss.det.",
    'Possessive pronoun': "poss.pron.",
    'Prepositional phrase': "prep.phr.",
    'Prepositional Pronoun': "prep.pron.",
    'Pronunciation': None,
    'Pronunciation 1': None,
    'Pronunciation 2': None,
    'Quotations': None,
    'References': None,
    'Reflexive pronoun': "refl.pron.",
    'Related expressions': None,
    'Related terms': None,
    'Related words': None,
    'Relative pronoun': "rel.pron.",
    'Saying': "saying",
    'See also': None,
    'Shorthand': None,
    '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
    'Sister projects': None,
    'Spelling note': None,
    'Synonyms': None,
    'Translation': None,
    'Translations': None,
    'Translations to be checked': None,
    'Transliteration': None,
    'Trivia': None,
    'Usage': None,
    'Usage in English': None,
    'Usage notes': None,
    'Verbal noun': "v.n.",
}
 464 PartsUsed = {}
 465 for p in list(Parts.keys()):
 466     PartsUsed[p] = 0
 467
 468 def encode(s):
 469     r = e(s)
 470     assert r[1] == len(s)
 471     return r[0]
 472
 473 def dowikilink(m):
 474     a = m.group(1).split("|")
 475     if len(a) > 1:
 476         link = a[1]
 477     else:
 478         link = a[0]
 479     if ':' in link:
 480         link = ""
 481     return link
 482
 483 seentemplates = {}
 484 def dotemplate(m):
 485     aa = m.group(1).split("|")
 486     args = {}
 487     n = 0
 488     for a in aa:
 489         am = re.match(r"(.*?)(=(.*))?", a)
 490         if am:
 491             args[am.group(1)] = am.group(3)
 492         else:
 493             n += 1
 494             args[n] = am.group(1)
 495
 496     #if aa[0] in seentemplates:
 497     #    seentemplates[aa[0]] += 1
 498     #else:
 499     #    seentemplates[aa[0]] = 1
 500     #    print len(seentemplates), aa[0]
 501     #print aa[0]
 502
 503     #if aa[0] not in Templates:
 504     #    return "(unknown template %s)" % aa[0]
 505     #body = Templates[aa[0]]
 506     #body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
 507     #assert "<onlyinclude>" not in body
 508     ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
 509     #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
 510     #def dotemplatearg(m):
 511     #    ta = m.group(1).split("|")
 512     #    if ta[0] in args:
 513     #        return args[ta[0]]
 514     #    elif len(ta) > 1:
 515     #        return ta[1]
 516     #    else:
 517     #        return "{{{%s}}}" % ta[0]
 518     #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
 519     #return dewiki(body)
 520
 521 def doparserfunction(m):
 522     a = m.group(2).split("|")
 523     if m.group(1) == "ifeq":
 524         if a[0] == a[1]:
 525             return a[2]
 526         elif len(a) >= 4:
 527             return a[3]
 528     return ""
 529
 530 def dewiki(body, indent = 0):
 531     # process in this order:
 532     #   {{{ }}}
 533     #   <> <>
 534     #   [[ ]]
 535     #   {{ }}
 536     #   ''' '''
 537     #   '' ''
 538     #body = wikimediatemplate.process(Templates, body)
 539     body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
 540     #body = re.sub(r"{{(.*?)}}", dotemplate, body)
 541     #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
 542     body = re.sub(r"'''(.*?)'''", r"\1", body)
 543     body = re.sub(r"''(.*?)''", r"\1", body)
 544     lines = body.split("\n")
 545     n = 0
 546     i = 0
 547     while i < len(lines):
 548         if len(lines[i]) > 0 and lines[i][0] == "#":
 549             if len(lines[i]) > 1 and lines[i][1] == '*':
 550                 wlines = textwrap.wrap(lines[i][2:].strip(),
 551                     initial_indent = "    * ",
 552                     subsequent_indent = "      ")
 553             elif len(lines[i]) > 1 and lines[i][1] == ':':
 554                 wlines = textwrap.wrap(lines[i][2:].strip(),
 555                     initial_indent = "        ",
 556                     subsequent_indent = "        ")
 557             else:
 558                 n += 1
 559                 wlines = textwrap.wrap(str(n) + ". " + lines[i][1:].strip(),
 560                     subsequent_indent = "   ")
 561         elif len(lines[i]) > 0 and lines[i][0] == "*":
 562             n = 0
 563             wlines = textwrap.wrap(lines[i][1:].strip(),
 564                 initial_indent = "* ",
 565                 subsequent_indent = "  ")
 566         else:
 567             n = 0
 568             wlines = textwrap.wrap(lines[i].strip())
 569             if len(wlines) == 0:
 570                 wlines = ['']
 571         lines[i:i+1] = wlines
 572         i += len(wlines)
 573     return ''.join("  "*(indent-1)+x+"\n" for x in lines)
 574
 575 class WikiSection:
 576     def __init__(self, heading, body):
 577         self.heading = heading
 578         self.body = body
 579         #self.lines = re.split("\n+", body.strip())
 580         #if len(self.lines) == 1 and len(self.lines[0]) == 0:
 581         #    self.lines = []
 582         self.children = []
 583     def __str__(self):
 584         return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children]))
 585     def add(self, section):
 586         self.children.append(section)
 587
 588 def parse(word, text):
 589     headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
 590     #print [x.group(1) for x in headings]
 591     doc = WikiSection(word, "")
 592     stack = [doc]
 593     for i, m in enumerate(headings):
 594         depth = len(m.group(1))
 595         if depth < len(stack):
 596             stack = stack[:depth]
 597         else:
 598             while depth > len(stack):
 599                 s = WikiSection(None, "")
 600                 stack[-1].add(s)
 601                 stack.append(s)
 602         if i+1 < len(headings):
 603             s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip())
 604         else:
 605             s = WikiSection(m.group(2), text[m.end(0):].strip())
 606         assert len(stack) == depth
 607         stack[-1].add(s)
 608         stack.append(s)
 609     #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
 610     #    doc = doc.children[0]
 611     return doc
 612
 613 def formatFull(word, doc):
 614     def f(depth, section):
 615         if section.heading:
 616             r = "  "*(depth-1) + section.heading + "\n\n"
 617         else:
 618             r = ""
 619         if section.body:
 620             r += dewiki(section.body, depth+1)+"\n"
 621         #r += "".join("  "*depth + x + "\n" for x in dewiki(section.body))
 622         #if len(section.lines) > 0:
 623         #    r += "\n"
 624         for c in section.children:
 625             r += f(depth+1, c)
 626         return r
 627     s = f(0, doc)
 628     s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 629     return s
 630
 631 def formatNormal(word, doc):
 632     def f(depth, posdepth, section):
 633         r = ""
 634         if depth == posdepth:
 635             if not section.heading or section.heading.startswith("Etymology"):
 636                 posdepth += 1
 637             elif section.heading in Parts:
 638                 #p = Parts[section.heading]
 639                 #if p:
 640                 #    r += "  "*(depth-1) + word + " (" + p + ")\n\n"
 641                 r += "  "*(depth-1) + section.heading + "\n\n"
 642             else:
 643                 print("Unknown part: (%s) %s" % (word, section.heading), file=errors)
 644                 return ""
 645         elif depth > posdepth:
 646             return ""
 647         elif section.heading:
 648             r += "  "*(depth-1) + section.heading + "\n\n"
 649         if section.body:
 650             r += dewiki(section.body, depth+1)+"\n"
 651         #r += "".join("  "*depth + x + "\n" for x in dewiki(section.lines))
 652         #if len(section.lines) > 0:
 653         #    r += "\n"
 654         for c in section.children:
 655             r += f(depth+1, posdepth, c)
 656         return r
 657     s = f(0, 3, doc)
 658     s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 659     return s
 660
 661 def formatBrief(word, doc):
 662     def f(depth, posdepth, section):
 663         if depth == posdepth:
 664             h = section.heading
 665             if not section.heading or section.heading.startswith("Etymology"):
 666                 posdepth += 1
 667             elif section.heading in Parts:
 668                 #h = Parts[section.heading]
 669                 #if h:
 670                 #    h = "%s (%s)" % (word, h)
 671                 pass
 672             stack.append([h, False])
 673         elif depth > 0:
 674             stack.append([section.heading, False])
 675         else:
 676             stack.append(["%h " + section.heading, False])
 677         r = ""
 678         #if section.heading:
 679         #    r += "  "*(depth-1) + section.heading + "\n"
 680         body = ''.join(x+"\n" for x in section.body.split("\n") if len(x) > 0 and x[0] == '#')
 681         if len(body) > 0:
 682             for i in range(len(stack)):
 683                 if not stack[i][1]:
 684                     if stack[i][0]:
 685                         r += "  "*(i-1) + stack[i][0] + "\n"
 686                     stack[i][1] = True
 687             r += dewiki(body, depth+1)
 688         for c in section.children:
 689             r += f(depth+1, posdepth, c)
 690         stack.pop()
 691         return r
 692     stack = []
 693     s = f(0, 3, doc)
 694     s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word
 695     return s
 696
 697 class WikiHandler(xml.sax.ContentHandler):
 698     def __init__(self):
 699         self.element = None
 700         self.page = None
 701         self.text = ""
 702         self.long = {}
 703     def startElement(self, name, attrs):
 704         #print "start", name, attrs
 705         self.element = name
 706     def endElement(self, name):
 707         #print "end", name
 708         if self.element == "text":
 709             if self.page:
 710                 if self.page in self.long:
 711                     print(self.page, len(self.text))
 712                     print()
 713                 self.doPage(self.page, self.text)
 714                 self.page = None
 715             self.text = ""
 716         self.element = None
 717     def characters(self, content):
 718         #print "characters", content
 719         if self.element == "title":
 720             if self.checkPage(content):
 721                 self.page = content
 722         elif self.element == "text":
 723             if self.page:
 724                 self.text += content
 725                 if len(self.text) > 100000 and self.page not in self.long:
 726                     self.long[self.page] = 1
 727     def checkPage(self, page):
 728         return False
 729     def doPage(self, page, text):
 730         pass
 731
 732 class TemplateHandler(WikiHandler):
 733     def checkPage(self, page):
 734         return page.startswith("Template:")
 735     def doPage(self, page, text):
 736         Templates[page[page.find(':')+1:].lower()] = text
 737
 738 class WordHandler(WikiHandler):
 739     def checkPage(self, page):
 740         return ':' not in page
 741     def doPage(self, page, text):
 742         m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
 743         if m:
 744             out.write("  See <%s>" % page)
 745             return
 746         doc = parse(page, text)
 747         out.write(formatBrief(page, doc))
 748         #print formatBrief(page, doc)
 749
 750 fn = sys.argv[1]
 751 info = """   This file was converted from the original database on:
 752              %s
 753
 754    The original data is available from:
 755              http://en.wiktionary.org
 756    The version from which this file was generated was:
 757              %s
 758
 759   Wiktionary is available under the GNU Free Documentation License.
 760 """ % (time.ctime(), os.path.basename(fn))
 761
 762 errors = open("mkdict.err", "w")
 763
 764 Templates = {}
 765 f = os.popen("bunzip2 -c %s" % fn, "r")
 766 xml.sax.parse(f, TemplateHandler())
 767 f.close()
 768
 769 f = os.popen("bunzip2 -c %s" % fn, "r")
 770 out = os.popen("dictfmt -p wiktionary-en --utf8 --columns 0 -u http://en.wiktionary.org", "w")
 771
 772 out.write("%%h English Wiktionary\n%s" % info)
 773 xml.sax.parse(f, WordHandler())
 774 f.close()
 775 out.close()