# Merge pull request #330634 from r-ryantm/auto-update/circumflex
# [NixPkgs.git] / pkgs / servers / dict / wiktionary / wiktionary2dict.py
# blob a737079bf5c8ad8682a9655c7ce1e5679ae4dc09
# Adapted to produce DICT-compatible files by Petr Rockai in 2012
# Based on code from wiktiondict by Greg Hewgill
import os
import re
import shlex
import sys
import textwrap
import time
import xml.sax
class Text:
    """Wrapper around a plain string ``s``."""

    def __init__(self, s):
        self.s = s

    def process(self):
        """Return the wrapped string unchanged."""
        # The bare name `s` is not defined in this scope; the value lives
        # on the instance.
        return self.s
class TemplateCall:
    """Placeholder node: it stores nothing and ``process()`` yields None."""

    def __init__(self):
        return None

    def process(self):
        """Do nothing and return None."""
        return None
class Template:
    """Ordered collection of parts, each exposing a ``process()`` method."""

    def __init__(self):
        self.parts = []

    def append(self, part):
        """Add ``part`` to the end of the part list."""
        self.parts.append(part)

    def process(self):
        """Return the concatenation of every part's ``process()`` result."""
        rendered = [part.process() for part in self.parts]
        return ''.join(rendered)
class Whitespace:
    """Holder for a string ``s``.

    Only referenced from a disabled branch of the tokeniser in this file.
    """

    def __init__(self, s):
        self.s = s
class OpenDouble:
    """Token emitted by the tokeniser for an opening ``{{``."""


class OpenTriple:
    """Token emitted by the tokeniser for an opening ``{{{``."""


class CloseDouble:
    """Token emitted by the tokeniser for a ``}}`` closing an OpenDouble."""


class CloseTriple:
    """Token emitted by the tokeniser for a ``}}}`` closing an OpenTriple."""
class Equals:
    """Token for an ``=`` character; its string form is ``"="``."""

    def __str__(self):
        text = "="
        return text
class Delimiter:
    """Token for a delimiter character; ``c`` is the character itself."""

    def __init__(self, c):
        self.c = c

    def __str__(self):
        # The string form is exactly the delimiter character.
        delimiter = self.c
        return delimiter
def Tokenise(s):
    """Generate the token stream for wiki template markup.

    ``s`` is converted with ``str()``.  The generator yields:

    * plain ``str`` runs of text,
    * ``OpenDouble`` / ``OpenTriple`` for ``{{`` / ``{{{``,
    * ``CloseDouble`` / ``CloseTriple`` for the matching closers,
    * ``Delimiter`` for ``:`` and ``|``,
    * ``Equals`` for ``=``.

    A ``}}`` seen while no opener is pending is yielded as the literal
    string ``"}}"``.  Raises SyntaxError when ``}}`` is met while the
    innermost pending opener is a triple and no third ``}`` follows.
    """
    text = str(s)
    size = len(text)
    pending = []      # width (2 or 3) of every opener not yet closed
    start = 0         # start of the literal run not yet emitted
    pos = 0
    while pos < size:
        ch = text[pos]
        if text.startswith("{{", pos):
            if pos > start:
                yield text[start:pos]
            if text.startswith("{", pos + 2):
                yield OpenTriple()
                pending.append(3)
                pos += 3
            else:
                yield OpenDouble()
                pending.append(2)
                pos += 2
            start = pos
        elif text.startswith("}}", pos):
            if pos > start:
                yield text[start:pos]
            if not pending:
                # Unbalanced closer: pass it through as ordinary text.
                yield "}}"
                pos += 2
            elif pending[-1] == 2:
                yield CloseDouble()
                pos += 2
                pending.pop()
            elif text.startswith("}", pos + 2):
                yield CloseTriple()
                pos += 3
                pending.pop()
            else:
                raise SyntaxError()
            start = pos
        elif ch in (':', '|'):
            if pos > start:
                yield text[start:pos]
            yield Delimiter(ch)
            pos += 1
            start = pos
        elif ch == '=':
            if pos > start:
                yield text[start:pos]
            yield Equals()
            pos += 1
            start = pos
        else:
            # Ordinary character (whitespace included): extend the current
            # literal run.  A variant that emitted Whitespace tokens here
            # is disabled in the original source.
            pos += 1
    if pos > start:
        yield text[start:pos]
def processSub(templates, tokens, args):
    """Expand one ``{{{name}}}`` / ``{{{name|default}}}`` reference.

    Called after an OpenTriple token has been consumed from ``tokens``.
    Reads the parameter name and, if a ``|`` follows, the default text up
    to the closing CloseTriple (nested template calls and parameter
    references inside the default are expanded).

    Returns, in order of preference: ``args[name]``, the default text,
    ``"en"`` for the parameter ``lang``, or the literal ``{{{name}}}``.
    Raises SyntaxError when the name is not a string token or when an
    unexpected token appears inside the default.
    """
    name = next(tokens)
    if not isinstance(name, str):
        raise SyntaxError
    tok = next(tokens)
    fallback = None
    if isinstance(tok, Delimiter) and tok.c == '|':
        pieces = []
        while True:
            tok = next(tokens)
            if isinstance(tok, CloseTriple):
                break
            if isinstance(tok, str):
                pieces.append(tok)
            elif isinstance(tok, OpenDouble):
                pieces.append(processTemplateCall(templates, tokens, args))
            elif isinstance(tok, OpenTriple):
                pieces.append(processSub(templates, tokens, args))
            else:
                print("Unexpected:", tok)
                raise SyntaxError()
        fallback = "".join(pieces)
    if name in args:
        return args[name]
    if fallback is not None:
        return fallback
    return "en" if name == "lang" else "{{{%s}}}" % name
def _readTemplateArg(templates, tokens, args, allow_equals):
    """Collect the text of one template-call argument.

    Consumes tokens until one that cannot be part of the argument is met
    and returns ``(text, terminating_token)``.  Nested template calls and
    parameter references are expanded against ``args``; delimiters other
    than ``|`` are kept literally.  ``=`` is kept literally only when
    ``allow_equals`` is true (i.e. while reading a named argument's value).
    """
    text = ""
    while True:
        t = next(tokens)
        if isinstance(t, str) or (allow_equals and isinstance(t, Equals)):
            text += str(t)
        elif isinstance(t, OpenDouble):
            text += processTemplateCall(templates, tokens, args)
        elif isinstance(t, OpenTriple):
            text += processSub(templates, tokens, args)
        elif isinstance(t, Delimiter) and t.c != '|':
            text += str(t)
        else:
            return text, t


def processTemplateCall(templates, tokens, args):
    """Expand one ``{{template|...}}`` call.

    Called after an OpenDouble token has been consumed from ``tokens``.

    templates -- mapping of lower-cased template name to template source.
    tokens    -- token iterator positioned on the template name.
    args      -- arguments of the *enclosing* template; used to resolve
                 ``{{{x}}}`` references that occur inside this call's own
                 arguments.

    Positional arguments are stored under "1", "2", ...; ``name=value``
    arguments under the stripped name with a stripped value.  The parser
    functions ``#if``, ``#ifeq``, ``#ifexist`` and ``#switch`` are
    evaluated directly; an unknown ``#...`` function terminates the
    program.  A template that is not in ``templates`` is returned as the
    literal ``{{name}}``.  Raises SyntaxError on an unexpected token.
    """
    # next(tokens) rather than tokens.next(): generators have no .next()
    # method on Python 3.
    template = next(tokens).strip().lower()
    # Keep this call's arguments separate from the caller's `args`, so that
    # parameter references inside the call still resolve against the
    # enclosing template.
    call_args = {}
    position = 1
    t = next(tokens)
    while True:
        if isinstance(t, Delimiter):
            name = str(position)
            arg, t = _readTemplateArg(templates, tokens, args, False)
            if isinstance(t, Equals):
                # What was read so far is the argument name, not its value.
                name = arg.strip()
                arg, t = _readTemplateArg(templates, tokens, args, True)
                arg = arg.strip()
            else:
                position += 1
            call_args[name] = arg
        elif isinstance(t, CloseDouble):
            break
        else:
            print("Unexpected:", t)
            raise SyntaxError
    if template.startswith('#'):
        # Missing arguments are treated as empty strings.
        if template == "#if":
            if call_args.get('1', "").strip():
                return call_args.get('2', "")
            return call_args.get('3', "")
        elif template == "#ifeq":
            if call_args.get('1', "").strip() == call_args.get('2', "").strip():
                return call_args.get('3', "")
            return call_args.get('4', "")
        elif template == "#ifexist":
            return ""
        elif template == "#switch":
            return call_args.get(call_args.get('1', "").strip(), "")
        else:
            print("Unknown ParserFunction:", template)
            sys.exit(1)
    if template not in templates:
        return "{{%s}}" % template
    return process(templates, templates[template], call_args)
# Patterns applied by process() before tokenising; compiled once.
_HTML_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
_NOINCLUDE_RE = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL)
_INCLUDEONLY_RE = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL)


def process(templates, s, args=None):
    """Expand template calls and parameter references in ``s``.

    templates -- mapping of lower-cased template name to template source.
    s         -- wiki text to expand.
    args      -- mapping of parameter name to value for ``{{{name}}}``
                 references; defaults to no parameters.

    HTML comments and ``<noinclude>`` sections are removed and
    ``<includeonly>`` wrappers are unwrapped first.  If the token stream
    ends in the middle of a construct, whatever was expanded up to that
    point is returned.  Asserts that ``s`` contains no ``<onlyinclude>``.
    """
    if args is None:
        # A fresh dict per call instead of a shared mutable default.
        args = {}
    s = _HTML_COMMENT_RE.sub("", s)
    s = _NOINCLUDE_RE.sub("", s)
    assert "<onlyinclude>" not in s
    s = _INCLUDEONLY_RE.sub(r"\1", s)
    pieces = []
    tokens = Tokenise(s)
    try:
        while True:
            t = next(tokens)
            if isinstance(t, OpenDouble):
                pieces.append(processTemplateCall(templates, tokens, args))
            elif isinstance(t, OpenTriple):
                pieces.append(processSub(templates, tokens, args))
            else:
                pieces.append(str(t))
    except StopIteration:
        # Normal end of input, or input exhausted inside a nested construct.
        pass
    return "".join(pieces)
def test():
    """Run the template-expansion self checks.

    Each case is expanded with ``process()`` against a small fixed set of
    templates.  The input is printed first; on the first mismatch the
    actual and expected values are printed and the program exits with
    status 1.
    """
    templates = {
        'lb': "{{",
        'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
        't': "start-{{{1|pqr}}}-end",
        't0': "start-{{{1}}}-end",
        't1': "start{{{1}}}end<noinclude>moo</noinclude>",
        't2a1': "{{t2demo|a|{{{1}}}}}",
        't2a2': "{{t2demo|a|2={{{1}}}}}",
        't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
        't5': "{{t2demo|{{{a}}}=b}}",
        't6': "t2demo|a",
    }
    cases = [
        ("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]]."),
        ("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]]."),
        ("{{t0|a}}", "start-a-end"),
        ("{{t0| }}", "start- -end"),
        ("{{t0|}}", "start--end"),
        ("{{t0}}", "start-{{{1}}}-end"),
        ("{{t0| }}", "start- -end"),
        ("{{t0|\n}}", "start-\n-end"),
        ("{{t0|1= }}", "start--end"),
        ("{{t0|1=\n}}", "start--end"),
        ("{{T}}", "start-pqr-end"),
        ("{{T|}}", "start--end"),
        ("{{T|abc}}", "start-abc-end"),
        ("{{T|abc|def}}", "start-abc-end"),
        ("{{T|1=abc|1=def}}", "start-def-end"),
        ("{{T|abc|1=def}}", "start-def-end"),
        ("{{T|1=abc|def}}", "start-def-end"),
        ("{{T|{{T}}}}", "start-start-pqr-end-end"),
        ("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end"),
        ("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end"),
        ("{{T|a{{t|b}}}}", "start-astart-b-end-end"),
        ("{{T|{{T|a=b}}}}", "start-start-pqr-end-end"),
        ("{{T|a=b}}", "start-pqr-end"),
        ("{{T|1=a=b}}", "start-a=b-end"),
        # Disabled cases, kept for reference:
        #("{{t1|{{lb}}tc}}}}", "start{{tcend}}"),
        #("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end"),
        #("{{t2a2|1=x=y}}", "start-a-middle-x=y-end"),
        #("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end"),
        #("{{ {{t6}} }}", "{{ t2demo|a }}"),
        # NOTE(review): the next two cases expect the [[a|b]] link to be
        # reduced to "b"; nothing in process() appears to do that, so they
        # presumably fail as written -- confirm before relying on test().
        ("{{t|[[a|b]]}}", "start-b-end"),
        ("{{t|[[a|b]] }}", "start-b -end"),
    ]
    for text, expected in cases:
        print("text:", text)
        got = process(templates, text)
        if got != expected:
            print("got:", got)
            print("expected:", expected)
            sys.exit(1)
# Maps a section heading to the abbreviation of the part of speech it
# denotes.  A value of None marks a heading that is known but has no
# abbreviation.
Parts = {
    # Standard POS headers
    'noun': "n.",
    'Noun': "n.",
    'Noun 1': "n.",
    'Noun 2': "n.",
    'Verb': "v.",
    'Adjective': "adj.",
    'Adverb': "adv.",
    'Pronoun': "pron.",
    'Conjunction': "conj.",
    'Interjection': "interj.",
    'Preposition': "prep.",
    'Proper noun': "n.p.",
    'Proper Noun': "n.p.",
    'Article': "art.",

    # Standard non-POS level 3 headers
    '{{acronym}}': "acr.",
    'Acronym': "acr.",
    '{{abbreviation}}': "abbr.",
    '[[Abbreviation]]': "abbr.",
    'Abbreviation': "abbr.",
    '[[initialism]]': "init.",
    '{{initialism}}': "init.",
    'Initialism': "init.",
    'Contraction': "cont.",
    'Prefix': "prefix",
    'Suffix': "suffix",
    'Symbol': "sym.",
    'Letter': "letter",
    'Idiom': "idiom",
    'Idioms': "idiom",
    'Phrase': "phrase",

    # Debated POS level 3 headers
    'Number': "num.",
    'Numeral': "num.",
    'Cardinal number': "num.",
    'Ordinal number': "num.",
    'Cardinal numeral': "num.",
    'Ordinal numeral': "num.",

    # Other headers in use
    'Personal pronoun': "pers.pron.",
    'Adjective/Adverb': "adj./adv.",
    'Proper adjective': "prop.adj.",
    'Determiner': "det.",
    'Demonstrative determiner': "dem.det.",
    'Clitic': "clitic",
    'Infix': "infix",
    'Counter': "counter",
    'Kanji': None,
    'Kanji reading': None,
    'Hiragana letter': None,
    'Katakana letter': None,
    'Pinyin': None,
    'Han character': None,
    'Hanzi': None,
    'Hanja': None,
    'Proverb': "prov.",
    'Expression': None,
    'Adjectival noun': None,
    'Quasi-adjective': None,
    'Particle': "part.",
    'Infinitive particle': "part.",
    'Possessive adjective': "poss.adj.",
    'Verbal prefix': "v.p.",
    'Postposition': "post.",
    'Prepositional article': "prep.art.",
    'Phrasal verb': "phr.v.",
    'Participle': "participle",
    'Interrogative auxiliary verb': "int.aux.v.",
    'Pronominal adverb': "pron.adv.",
    'Adnominal': "adn.",
    'Abstract pronoun': "abs.pron.",
    'Conjunction particle': None,
    'Root': "root",

    # Non-standard, deprecated headers
    'Noun form': "n.",
    'Verb form': "v.",
    'Adjective form': "adj.form.",
    'Nominal phrase': "nom.phr.",
    'Noun phrase': "n. phrase",
    'Verb phrase': "v. phrase",
    'Transitive verb': "v.t.",
    'Intransitive verb': "v.i.",
    'Reflexive verb': "v.r.",
    'Cmavo': None,
    'Romaji': "rom.",
    'Hiragana': None,
    'Furigana': None,
    'Compounds': None,

    # Other headers seen
    'Alternative forms': None,
    'Alternative spellings': None,
    'Anagrams': None,
    'Antonym': None,
    'Antonyms': None,
    'Conjugation': None,
    'Declension': None,
    'Declension and pronunciations': None,
    'Definite Article': "def.art.",
    'Definite article': "def.art.",
    'Demonstrative pronoun': "dem.pron.",
    'Derivation': None,
    'Derived expression': None,
    'Derived expressions': None,
    'Derived forms': None,
    'Derived phrases': None,
    'Derived terms': None,
    'Derived, Related terms': None,
    'Descendants': None,
    #'Etymology': None,
    #'Etymology 1': None,
    #'Etymology 2': None,
    #'Etymology 3': None,
    #'Etymology 4': None,
    #'Etymology 5': None,
    'Examples': None,
    'External links': None,
    '[[Gismu]]': None,
    'Gismu': None,
    'Homonyms': None,
    'Homophones': None,
    'Hyphenation': None,
    'Indefinite article': "art.",
    'Indefinite pronoun': "ind.pron.",
    'Indefinite Pronoun': "ind.pron.",
    'Indetermined pronoun': "ind.pron.",
    'Interrogative conjunction': "int.conj.",
    'Interrogative determiner': "int.det.",
    'Interrogative particle': "int.part.",
    'Interrogative pronoun': "int.pron.",
    'Legal expression': "legal",
    'Mass noun': "n.",
    'Miscellaneous': None,
    'Mutations': None,
    'Noun and verb': "n/v.",
    'Other language': None,
    'Pinyin syllable': None,
    'Possessive determiner': "poss.det.",
    'Possessive pronoun': "poss.pron.",
    'Prepositional phrase': "prep.phr.",
    'Prepositional Pronoun': "prep.pron.",
    'Pronunciation': None,
    'Pronunciation 1': None,
    'Pronunciation 2': None,
    'Quotations': None,
    'References': None,
    'Reflexive pronoun': "refl.pron.",
    'Related expressions': None,
    'Related terms': None,
    'Related words': None,
    'Relative pronoun': "rel.pron.",
    'Saying': "saying",
    'See also': None,
    'Shorthand': None,
    '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
    'Sister projects': None,
    'Spelling note': None,
    'Synonyms': None,
    'Translation': None,
    'Translations': None,
    'Translations to be checked': None,
    'Transliteration': None,
    'Trivia': None,
    'Usage': None,
    'Usage in English': None,
    'Usage notes': None,
    'Verbal noun': "v.n.",
}

# One zero-initialised counter per known heading.
PartsUsed = dict.fromkeys(Parts, 0)
def encode(s):
    """Return ``s`` encoded as bytes.

    The original body called a module-level encoder ``e`` that is not
    defined anywhere in this file, so every call raised NameError.
    NOTE(review): UTF-8 is assumed here (dictfmt is run with a UTF-8
    locale) -- confirm the intended codec.
    """
    # str.encode is strict and always consumes the whole string, which is
    # what the original `assert r[1] == len(s)` verified.
    return s.encode("utf-8")
def dowikilink(m):
    """Return the replacement text for a ``[[...]]`` match.

    ``m.group(1)`` is the text between the brackets.  The second
    ``|``-separated field is used when present, otherwise the first.
    If the chosen text contains ``:`` the link is replaced by an empty
    string.
    """
    fields = m.group(1).split("|")
    shown = fields[1] if len(fields) > 1 else fields[0]
    return "" if ':' in shown else shown
# Only referenced from the disabled code inside dotemplate() below.
seentemplates = {}
def dotemplate(m):
    """Split the ``|``-separated fields of ``m.group(1)`` into a local dict.

    The function has no return statement (it returns None); the code that
    would have used the collected arguments is commented out below.

    NOTE(review): the pattern starts with a lazy ``(.*?)`` followed by an
    optional group, so ``re.match`` always succeeds with an empty
    ``group(1)``; every field is therefore stored under the key "" and the
    ``else`` branch is unreachable (it would also fail, since ``am`` would
    be None there).  Looks unfinished -- confirm before re-enabling the
    caller in dewiki().
    """
    aa = m.group(1).split("|")
    args = {}
    n = 0
    for a in aa:
        am = re.match(r"(.*?)(=(.*))?", a)
        if am:
            args[am.group(1)] = am.group(3)
        else:
            n += 1
            args[n] = am.group(1)

    #if aa[0] in seentemplates:
    #    seentemplates[aa[0]] += 1
    #else:
    #    seentemplates[aa[0]] = 1
    #    print len(seentemplates), aa[0]
    #print aa[0]

    #if aa[0] not in Templates:
    #    return "(unknown template %s)" % aa[0]
    #body = Templates[aa[0]]
    #body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
    #assert "<onlyinclude>" not in body
    ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
    #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
    #def dotemplatearg(m):
    #    ta = m.group(1).split("|")
    #    if ta[0] in args:
    #        return args[ta[0]]
    #    elif len(ta) > 1:
    #        return ta[1]
    #    else:
    #        return "{{{%s}}}" % ta[0]
    #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
    #return dewiki(body)
def doparserfunction(m):
    """Evaluate a parser-function match; only ``ifeq`` is handled.

    ``m.group(1)`` is the function name and ``m.group(2)`` its
    ``|``-separated operands.  For ``ifeq`` the third operand is returned
    when the first two are equal, otherwise the fourth operand if there is
    one.  Every other case yields an empty string.
    """
    operands = m.group(2).split("|")
    if m.group(1) != "ifeq":
        return ""
    if operands[0] == operands[1]:
        return operands[2]
    if len(operands) >= 4:
        return operands[3]
    return ""
def dewiki(body, indent = 0):
    """Convert wiki markup in ``body`` to wrapped plain text.

    ``[[...]]`` links are replaced via ``dowikilink`` and the ``'''`` /
    ``''`` emphasis markers are removed.  Each input line is then wrapped
    with ``textwrap``:

    * ``#*`` lines become bullet items,
    * ``#:`` lines become indented continuation text,
    * other ``#`` lines are numbered consecutively,
    * ``*`` lines become bullet items,
    * anything else is wrapped as-is.

    Numbering restarts after any line that does not start with ``#``.
    Every output line is prefixed with ``indent - 1`` spaces and ends with
    a newline; an input line that wraps to nothing yields one empty line.
    """
    body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body)
    body = re.sub(r"'''(.*?)'''", r"\1", body)
    body = re.sub(r"''(.*?)''", r"\1", body)
    wrapped_all = []
    counter = 0
    for line in body.split("\n"):
        if line.startswith("#"):
            marker = line[1:2]
            if marker == '*':
                wrapped = textwrap.wrap(line[2:].strip(),
                                        initial_indent = " * ",
                                        subsequent_indent = " ")
            elif marker == ':':
                wrapped = textwrap.wrap(line[2:].strip(),
                                        initial_indent = " ",
                                        subsequent_indent = " ")
            else:
                counter += 1
                wrapped = textwrap.wrap(str(counter) + ". " + line[1:].strip(),
                                        subsequent_indent = " ")
        elif line.startswith("*"):
            counter = 0
            wrapped = textwrap.wrap(line[1:].strip(),
                                    initial_indent = "* ",
                                    subsequent_indent = " ")
        else:
            counter = 0
            wrapped = textwrap.wrap(line.strip())
        # Keep a placeholder so blank input lines survive as blank output.
        wrapped_all.extend(wrapped if wrapped else [''])
    prefix = " "*(indent-1)
    return ''.join(prefix + x + "\n" for x in wrapped_all)
class WikiSection:
    """Node of the section tree: a heading, its body text and sub-sections."""

    def __init__(self, heading, body):
        self.heading, self.body = heading, body
        self.children = []

    def __str__(self):
        # "<heading:body-length:child,child,...>"; a missing body counts as 0.
        size = len(self.body or "")
        kids = ','.join(map(str, self.children))
        return "<%s:%i:%s>" % (self.heading, size, kids)

    def add(self, section):
        """Append ``section`` as the last child."""
        self.children.append(section)
def parse(word, text):
    """Build the section tree of a page.

    word -- page title; becomes the heading of the returned root section.
    text -- wiki source of the page.

    Every ``== heading ==`` line (followed by a newline) starts a section
    whose depth is the number of leading ``=`` characters and whose body
    is the stripped text up to the next heading.  When a heading is deeper
    than its predecessor by more than one level, empty sections with a
    None heading are inserted to fill the gap.  Returns the root
    WikiSection.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    headings = list(re.finditer(r"^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE))
    doc = WikiSection(word, "")
    # stack[d] is the currently open section at depth d (the root is depth 0).
    stack = [doc]
    for i, m in enumerate(headings):
        depth = len(m.group(1))
        if depth < len(stack):
            # Close every section at this depth or deeper.
            stack = stack[:depth]
        else:
            while depth > len(stack):
                s = WikiSection(None, "")
                stack[-1].add(s)
                stack.append(s)
        if i+1 < len(headings):
            body_end = headings[i+1].start(0)
        else:
            body_end = len(text)
        s = WikiSection(m.group(2), text[m.end(0):body_end].strip())
        assert len(stack) == depth
        stack[-1].add(s)
        stack.append(s)
    return doc
def formatFull(word, doc):
    """Render the whole section tree ``doc`` as text.

    Every section contributes its heading (indented by depth - 1 spaces
    and followed by a blank line) and, when it has a body, the body passed
    through ``dewiki``.  A reference line for ``word`` is appended at the
    end.
    """
    def render(level, node):
        chunks = []
        if node.heading:
            chunks.append(" "*(level-1) + node.heading + "\n\n")
        if node.body:
            chunks.append(dewiki(node.body, level+1)+"\n")
        chunks.extend(render(level+1, child) for child in node.children)
        return "".join(chunks)

    return render(0, doc) + "Ref: http://en.wiktionary.org/wiki/%s\n" % word
def formatNormal(word, doc):
    """Render ``doc`` down to the part-of-speech level.

    The part-of-speech level starts at depth 3 and moves one level deeper
    below a section with no heading or an "Etymology..." heading.  At that
    level only headings listed in ``Parts`` are rendered; unknown ones are
    reported to the ``errors`` stream and dropped together with their
    sub-sections.  Anything deeper than that level is dropped.  A
    reference line for ``word`` is appended at the end.
    """
    def render(level, poslevel, node):
        heading = node.heading
        if level > poslevel:
            return ""
        out = ""
        if level == poslevel:
            if not heading or heading.startswith("Etymology"):
                poslevel += 1
            elif heading in Parts:
                out += " "*(level-1) + heading + "\n\n"
            else:
                print("Unknown part: (%s) %s" % (word, heading), file=errors)
                return ""
        elif heading:
            out += " "*(level-1) + heading + "\n\n"
        if node.body:
            out += dewiki(node.body, level+1)+"\n"
        for child in node.children:
            out += render(level+1, poslevel, child)
        return out

    text = render(0, 3, doc)
    return text + "Ref: http://en.wiktionary.org/wiki/%s\n" % word
def formatBrief(word, doc):
    """Render only the definition lines (those starting with ``#``).

    The headings on the path from the root to a section are written
    lazily: each is emitted at most once, and only when a section at or
    below it actually has definition lines.  The root heading is written
    with a ``%h `` prefix.  A reference line for ``word`` is appended at
    the end.
    """
    # One [heading, already_written] entry per section on the current path.
    pending = []

    def render(level, poslevel, node):
        heading = node.heading
        if level == poslevel:
            if not heading or heading.startswith("Etymology"):
                poslevel += 1
            pending.append([heading, False])
        elif level > 0:
            pending.append([heading, False])
        else:
            pending.append(["%h " + heading, False])
        out = ""
        senses = [ln for ln in node.body.split("\n") if ln.startswith('#')]
        if senses:
            body = ''.join(ln+"\n" for ln in senses)
            for idx, entry in enumerate(pending):
                if entry[1]:
                    continue
                if entry[0]:
                    out += " "*(idx-1) + entry[0] + "\n"
                entry[1] = True
            out += dewiki(body, level+1)
        for child in node.children:
            out += render(level+1, poslevel, child)
        pending.pop()
        return out

    text = render(0, 3, doc)
    return text + "Ref: http://en.wiktionary.org/wiki/%s\n" % word
class WikiHandler(xml.sax.ContentHandler):
    """SAX handler that pairs each page title with its text.

    Subclasses override ``checkPage(title)`` to select pages and
    ``doPage(title, text)`` to consume them.  ``doPage`` is called when a
    ``text`` element ends and the preceding title was accepted.
    """

    def __init__(self):
        super().__init__()
        self.element = None     # name of the element currently open
        self.page = None        # accepted title awaiting its text
        self.text = ""          # text collected for self.page
        self.long = {}          # titles whose text exceeded 100000 chars
        self._title = []        # chunks of the title being read

    def startElement(self, name, attrs):
        self.element = name
        if name == "title":
            self._title = []

    def endElement(self, name):
        if self.element == "title":
            # The parser may report one title in several characters()
            # calls, so the page is only selected once the title is whole.
            title = "".join(self._title)
            self._title = []
            # A rejected title clears any stale page left by an earlier
            # accepted title that had no text element.
            self.page = title if self.checkPage(title) else None
        elif self.element == "text":
            if self.page:
                if self.page in self.long:
                    print(self.page, len(self.text))
                    print()
                self.doPage(self.page, self.text)
                self.page = None
            self.text = ""
        self.element = None

    def characters(self, content):
        if self.element == "title":
            self._title.append(content)
        elif self.element == "text":
            if self.page:
                self.text += content
                if len(self.text) > 100000 and self.page not in self.long:
                    self.long[self.page] = 1

    def checkPage(self, page):
        """Return True if the page titled ``page`` should be processed."""
        return False

    def doPage(self, page, text):
        """Consume one selected page; the base class does nothing."""
        pass
class TemplateHandler(WikiHandler):
    """Collects the source of every ``Template:`` page into ``Templates``."""

    def checkPage(self, page):
        prefix = "Template:"
        return page[:len(prefix)] == prefix

    def doPage(self, page, text):
        # Key: the title after its first ':', lower-cased.
        name = page[page.find(':')+1:]
        Templates[name.lower()] = text
class WordHandler(WikiHandler):
    """Writes a brief entry to ``out`` for every page without ':' in its title."""

    def checkPage(self, page):
        return page.find(':') < 0

    def doPage(self, page, text):
        redirect = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE)
        if redirect is not None:
            # NOTE(review): this writes the page's own title, not the
            # redirect target captured in group(1), and emits no headword
            # or newline -- confirm this is the intended output.
            out.write(" See <%s>" % page)
            return
        out.write(formatBrief(page, parse(page, text)))
750 fn = sys.argv[1]
751 info = """ This file was converted from the original database on:
754 The original data is available from:
755 http://en.wiktionary.org
756 The version from which this file was generated was:
759 Wiktionary is available under the GNU Free Documentation License.
760 """ % (time.ctime(), os.path.basename(fn))
762 errors = open("mkdict.err", "w")
764 Templates = {}
765 f = os.popen("bunzip2 -c %s" % fn, "r")
766 xml.sax.parse(f, TemplateHandler())
767 f.close()
769 f = os.popen("bunzip2 -c %s" % fn, "r")
770 out = os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")
772 out.write("%%h English Wiktionary\n%s" % info)
773 xml.sax.parse(f, WordHandler())
774 f.close()
775 out.close()