1 # Adapted to produce DICT-compatible files by Petr Rockai in 2012
2 # Based on code from wiktiondict by Greg Hewgill
11 def __init__(self
, s
):
25 def append(self
, part
):
26 self
.parts
.append(part
)
28 return ''.join(x
.process() for x
in self
.parts
)
31 def __init__(self
, s
):
class OpenDouble:
    """Marker token type that starts a template call.

    process(), processSub() and processTemplateCall() test tokens with
    isinstance() against this class and, on a match, pass the token stream
    to processTemplateCall().  Instances carry no state.
    NOTE(review): presumably emitted by the tokenizer for "{{"; the emitting
    code is not visible here -- confirm.
    """
class OpenTriple:
    """Marker token type that starts a parameter substitution.

    process(), processSub() and processTemplateCall() test tokens with
    isinstance() against this class and, on a match, pass the token stream
    to processSub().  Instances carry no state.
    NOTE(review): presumably emitted by the tokenizer for "{{{"; the
    emitting code is not visible here -- confirm.
    """
class CloseDouble:
    """Marker token type checked for by processTemplateCall().

    Tokens are matched against this class with isinstance().  Instances
    carry no state.
    NOTE(review): presumably emitted by the tokenizer for "}}" and used to
    end a template call; neither is visible here -- confirm.
    """
class CloseTriple:
    """Marker token type checked for by processSub().

    Tokens are matched against this class with isinstance().  Instances
    carry no state.
    NOTE(review): presumably emitted by the tokenizer for "}}}" and used to
    end a parameter substitution; neither is visible here -- confirm.
    """
44 def __init__(self
, c
):
55 if s
[i
] == '{' and i
+1 < len(s
) and s
[i
+1] == '{':
58 if i
+2 < len(s
) and s
[i
+2] == '{':
67 elif s
[i
] == '}' and i
+1 < len(s
) and s
[i
+1] == '}':
77 elif i
+2 < len(s
) and s
[i
+2] == '}':
84 elif s
[i
] == ':' or s
[i
] == '|':
96 #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n':
100 # m = re.match(r"\s+", s[i:])
102 # yield Whitespace(m.group(0))
103 # i += len(m.group(0))
110 def processSub(templates
, tokens
, args
):
112 if not isinstance(t
, str):
117 if isinstance(t
, Delimiter
) and t
.c
== '|':
121 if isinstance(t
, str):
123 elif isinstance(t
, OpenDouble
):
124 default
+= processTemplateCall(templates
, tokens
, args
)
125 elif isinstance(t
, OpenTriple
):
126 default
+= processSub(templates
, tokens
, args
)
127 elif isinstance(t
, CloseTriple
):
130 print("Unexpected:", t
)
134 if default
is not None:
138 return "{{{%s}}}" % name
140 def processTemplateCall(templates
, tokens
, args
):
141 template
= tokens
.next().strip().lower()
146 if isinstance(t
, Delimiter
):
151 if isinstance(t
, str):
153 elif isinstance(t
, OpenDouble
):
154 arg
+= processTemplateCall(templates
, tokens
, args
)
155 elif isinstance(t
, OpenTriple
):
156 arg
+= processSub(templates
, tokens
, args
)
157 elif isinstance(t
, Delimiter
) and t
.c
!= '|':
161 if isinstance(t
, Equals
):
166 if isinstance(t
, (str, Equals
)):
168 elif isinstance(t
, OpenDouble
):
169 arg
+= processTemplateCall(templates
, tokens
, args
)
170 elif isinstance(t
, OpenTriple
):
171 arg
+= processSub(templates
, tokens
, args
)
172 elif isinstance(t
, Delimiter
) and t
.c
!= '|':
180 elif isinstance(t
, CloseDouble
):
183 print("Unexpected:", t
)
185 #print template, args
186 if template
[0] == '#':
187 if template
== "#if":
188 if args
['1'].strip():
194 elif template
== "#ifeq":
195 if args
['1'].strip() == args
['2'].strip():
201 elif template
== "#ifexist":
203 elif template
== "#switch":
204 sw
= args
['1'].strip()
210 print("Unknown ParserFunction:", template
)
212 if template
not in templates
:
213 return "{{%s}}" % template
214 return process(templates
, templates
[template
], args
)
216 def process(templates
, s
, args
= {}):
217 s
= re
.compile(r
"<!--.*?-->", re
.DOTALL
).sub("", s
)
218 s
= re
.compile(r
"<noinclude>.*?</noinclude>", re
.DOTALL
).sub("", s
)
219 assert "<onlyinclude>" not in s
220 #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s)
221 s
= re
.compile(r
"<includeonly>(.*?)</includeonly>", re
.DOTALL
).sub(r
"\1", s
)
223 #print list(Tokenise(s))
228 if isinstance(t
, OpenDouble
):
229 r
+= processTemplateCall(templates
, tokens
, args
)
230 elif isinstance(t
, OpenTriple
):
231 r
+= processSub(templates
, tokens
, args
)
234 except StopIteration:
241 'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].",
242 't': "start-{{{1|pqr}}}-end",
243 't0': "start-{{{1}}}-end",
244 't1': "start{{{1}}}end<noinclude>moo</noinclude>",
245 't2a1': "{{t2demo|a|{{{1}}}}}",
246 't2a2': "{{t2demo|a|2={{{1}}}}}",
247 't2demo': "start-{{{1}}}-middle-{{{2}}}-end",
248 't5': "{{t2demo|{{{a}}}=b}}",
251 def t(text
, expected
):
253 s
= process(templates
, text
)
256 print("expected:", expected
)
258 t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].")
259 t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. You can reference my page at [[Smith, John]].")
260 t("{{t0|a}}", "start-a-end")
261 t("{{t0| }}", "start- -end")
262 t("{{t0|}}", "start--end")
263 t("{{t0}}", "start-{{{1}}}-end")
264 t("{{t0| }}", "start- -end")
265 t("{{t0|\n}}", "start-\n-end")
266 t("{{t0|1= }}", "start--end")
267 t("{{t0|1=\n}}", "start--end")
268 t("{{T}}", "start-pqr-end")
269 t("{{T|}}", "start--end")
270 t("{{T|abc}}", "start-abc-end")
271 t("{{T|abc|def}}", "start-abc-end")
272 t("{{T|1=abc|1=def}}", "start-def-end")
273 t("{{T|abc|1=def}}", "start-def-end")
274 t("{{T|1=abc|def}}", "start-def-end")
275 t("{{T|{{T}}}}", "start-start-pqr-end-end")
276 t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end")
277 t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end")
278 t("{{T|a{{t|b}}}}", "start-astart-b-end-end")
279 t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end")
280 t("{{T|a=b}}", "start-pqr-end")
281 t("{{T|1=a=b}}", "start-a=b-end")
282 #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}")
283 #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end")
284 #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end")
285 #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end")
286 #t("{{ {{t6}} }}", "{{ t2demo|a }}")
287 t("{{t|[[a|b]]}}", "start-b-end")
288 t("{{t|[[a|b]] }}", "start-b -end")
291 # Standard POS headers
300 'Conjunction': "conj.",
301 'Interjection': "interj.",
302 'Preposition': "prep.",
303 'Proper noun': "n.p.",
304 'Proper Noun': "n.p.",
307 # Standard non-POS level 3 headers
308 '{{acronym}}': "acr.",
310 '{{abbreviation}}': "abbr.",
311 '[[Abbreviation]]': "abbr.",
312 'Abbreviation': "abbr.",
313 '[[initialism]]': "init.",
314 '{{initialism}}': "init.",
315 'Initialism': "init.",
316 'Contraction': "cont.",
325 # Debated POS level 3 headers
328 'Cardinal number': "num.",
329 'Ordinal number': "num.",
330 'Cardinal numeral': "num.",
331 'Ordinal numeral': "num.",
333 # Other headers in use
334 'Personal pronoun': "pers.pron.",
335 'Adjective/Adverb': "adj./adv.",
336 'Proper adjective': "prop.adj.",
337 'Determiner': "det.",
338 'Demonstrative determiner': "dem.det.",
341 'Counter': "counter",
343 'Kanji reading': None,
344 'Hiragana letter': None,
345 'Katakana letter': None,
347 'Han character': None,
352 'Adjectival noun': None,
353 'Quasi-adjective': None,
355 'Infinitive particle': "part.",
356 'Possessive adjective': "poss.adj.",
357 'Verbal prefix': "v.p.",
358 'Postposition': "post.",
359 'Prepositional article': "prep.art.",
360 'Phrasal verb': "phr.v.",
361 'Participle': "participle",
362 'Interrogative auxiliary verb': "int.aux.v.",
363 'Pronominal adverb': "pron.adv.",
365 'Abstract pronoun': "abs.pron.",
366 'Conjunction particle': None,
369 # Non-standard, deprecated headers
372 'Adjective form': "adj.form.",
373 'Nominal phrase': "nom.phr.",
374 'Noun phrase': "n. phrase",
375 'Verb phrase': "v. phrase",
376 'Transitive verb': "v.t.",
377 'Intransitive verb': "v.i.",
378 'Reflexive verb': "v.r.",
386 'Alternative forms': None,
387 'Alternative spellings': None,
393 'Declension and pronunciations': None,
394 'Definite Article': "def.art.",
395 'Definite article': "def.art.",
396 'Demonstrative pronoun': "dem.pron.",
398 'Derived expression': None,
399 'Derived expressions': None,
400 'Derived forms': None,
401 'Derived phrases': None,
402 'Derived terms': None,
403 'Derived, Related terms': None,
406 #'Etymology 1': None,
407 #'Etymology 2': None,
408 #'Etymology 3': None,
409 #'Etymology 4': None,
410 #'Etymology 5': None,
412 'External links': None,
418 'Indefinite article': "art.",
419 'Indefinite pronoun': "ind.pron.",
420 'Indefinite Pronoun': "ind.pron.",
421 'Indetermined pronoun': "ind.pron.",
422 'Interrogative conjunction': "int.conj.",
423 'Interrogative determiner': "int.det.",
424 'Interrogative particle': "int.part.",
425 'Interrogative pronoun': "int.pron.",
426 'Legal expression': "legal",
428 'Miscellaneous': None,
430 'Noun and verb': "n/v.",
431 'Other language': None,
432 'Pinyin syllable': None,
433 'Possessive determiner': "poss.det.",
434 'Possessive pronoun': "poss.pron.",
435 'Prepositional phrase': "prep.phr.",
436 'Prepositional Pronoun': "prep.pron.",
437 'Pronunciation': None,
438 'Pronunciation 1': None,
439 'Pronunciation 2': None,
442 'Reflexive pronoun': "refl.pron.",
443 'Related expressions': None,
444 'Related terms': None,
445 'Related words': None,
446 'Relative pronoun': "rel.pron.",
450 '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None,
451 'Sister projects': None,
452 'Spelling note': None,
455 'Translations': None,
456 'Translations to be checked': None,
457 'Transliteration': None,
460 'Usage in English': None,
462 'Verbal noun': "v.n.",
465 for p
in list(Parts
.keys()):
470 assert r
[1] == len(s
)
474 a
= m
.group(1).split("|")
485 aa
= m
.group(1).split("|")
489 am
= re
.match(r
"(.*?)(=(.*))?", a
)
491 args
[am
.group(1)] = am
.group(3)
494 args
[n
] = am
.group(1)
496 #if aa[0] in seentemplates:
497 # seentemplates[aa[0]] += 1
499 # seentemplates[aa[0]] = 1
500 # print len(seentemplates), aa[0]
503 #if aa[0] not in Templates:
504 # return "(unknown template %s)" % aa[0]
505 #body = Templates[aa[0]]
506 #body = re.sub(r"<noinclude>.*?</noinclude>", "", body)
507 #assert "<onlyinclude>" not in body
508 ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body)
509 #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body)
510 #def dotemplatearg(m):
511 # ta = m.group(1).split("|")
517 # return "{{{%s}}}" % ta[0]
518 #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body)
521 def doparserfunction(m
):
522 a
= m
.group(2).split("|")
523 if m
.group(1) == "ifeq":
530 def dewiki(body
, indent
= 0):
531 # process in this order:
538 #body = wikimediatemplate.process(Templates, body)
539 body
= re
.sub(r
"\[\[(.*?)\]\]", dowikilink
, body
)
540 #body = re.sub(r"{{(.*?)}}", dotemplate, body)
541 #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body)
542 body
= re
.sub(r
"'''(.*?)'''", r
"\1", body
)
543 body
= re
.sub(r
"''(.*?)''", r
"\1", body
)
544 lines
= body
.split("\n")
547 while i
< len(lines
):
548 if len(lines
[i
]) > 0 and lines
[i
][0] == "#":
549 if len(lines
[i
]) > 1 and lines
[i
][1] == '*':
550 wlines
= textwrap
.wrap(lines
[i
][2:].strip(),
551 initial_indent
= " * ",
552 subsequent_indent
= " ")
553 elif len(lines
[i
]) > 1 and lines
[i
][1] == ':':
554 wlines
= textwrap
.wrap(lines
[i
][2:].strip(),
555 initial_indent
= " ",
556 subsequent_indent
= " ")
559 wlines
= textwrap
.wrap(str(n
) + ". " + lines
[i
][1:].strip(),
560 subsequent_indent
= " ")
561 elif len(lines
[i
]) > 0 and lines
[i
][0] == "*":
563 wlines
= textwrap
.wrap(lines
[i
][1:].strip(),
564 initial_indent
= "* ",
565 subsequent_indent
= " ")
568 wlines
= textwrap
.wrap(lines
[i
].strip())
571 lines
[i
:i
+1] = wlines
573 return ''.join(" "*(indent
-1)+x
+"\n" for x
in lines
)
576 def __init__(self
, heading
, body
):
577 self
.heading
= heading
579 #self.lines = re.split("\n+", body.strip())
580 #if len(self.lines) == 1 and len(self.lines[0]) == 0:
584 return "<%s:%i:%s>" % (self
.heading
, len(self
.body
or ""), ','.join([str(x
) for x
in self
.children
]))
585 def add(self
, section
):
586 self
.children
.append(section
)
588 def parse(word
, text
):
589 headings
= list(re
.finditer("^(=+)\s*(.*?)\s*=+\n", text
, re
.MULTILINE
))
590 #print [x.group(1) for x in headings]
591 doc
= WikiSection(word
, "")
593 for i
, m
in enumerate(headings
):
594 depth
= len(m
.group(1))
595 if depth
< len(stack
):
596 stack
= stack
[:depth
]
598 while depth
> len(stack
):
599 s
= WikiSection(None, "")
602 if i
+1 < len(headings
):
603 s
= WikiSection(m
.group(2), text
[m
.end(0):headings
[i
+1].start(0)].strip())
605 s
= WikiSection(m
.group(2), text
[m
.end(0):].strip())
606 assert len(stack
) == depth
609 #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1:
610 # doc = doc.children[0]
613 def formatFull(word
, doc
):
614 def f(depth
, section
):
616 r
= " "*(depth
-1) + section
.heading
+ "\n\n"
620 r
+= dewiki(section
.body
, depth
+1)+"\n"
621 #r += "".join(" "*depth + x + "\n" for x in dewiki(section.body))
622 #if len(section.lines) > 0:
624 for c
in section
.children
:
628 s
+= "Ref: http://en.wiktionary.org/wiki/%s\n" % word
631 def formatNormal(word
, doc
):
632 def f(depth
, posdepth
, section
):
634 if depth
== posdepth
:
635 if not section
.heading
or section
.heading
.startswith("Etymology"):
637 elif section
.heading
in Parts
:
638 #p = Parts[section.heading]
640 # r += " "*(depth-1) + word + " (" + p + ")\n\n"
641 r
+= " "*(depth
-1) + section
.heading
+ "\n\n"
643 print("Unknown part: (%s) %s" % (word
, section
.heading
), file=errors
)
645 elif depth
> posdepth
:
647 elif section
.heading
:
648 r
+= " "*(depth
-1) + section
.heading
+ "\n\n"
650 r
+= dewiki(section
.body
, depth
+1)+"\n"
651 #r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines))
652 #if len(section.lines) > 0:
654 for c
in section
.children
:
655 r
+= f(depth
+1, posdepth
, c
)
658 s
+= "Ref: http://en.wiktionary.org/wiki/%s\n" % word
661 def formatBrief(word
, doc
):
662 def f(depth
, posdepth
, section
):
663 if depth
== posdepth
:
665 if not section
.heading
or section
.heading
.startswith("Etymology"):
667 elif section
.heading
in Parts
:
668 #h = Parts[section.heading]
670 # h = "%s (%s)" % (word, h)
672 stack
.append([h
, False])
674 stack
.append([section
.heading
, False])
676 stack
.append(["%h " + section
.heading
, False])
679 # r += " "*(depth-1) + section.heading + "\n"
680 body
= ''.join(x
+"\n" for x
in section
.body
.split("\n") if len(x
) > 0 and x
[0] == '#')
682 for i
in range(len(stack
)):
685 r
+= " "*(i
-1) + stack
[i
][0] + "\n"
687 r
+= dewiki(body
, depth
+1)
688 for c
in section
.children
:
689 r
+= f(depth
+1, posdepth
, c
)
694 s
+= "Ref: http://en.wiktionary.org/wiki/%s\n" % word
697 class WikiHandler(xml
.sax
.ContentHandler
):
703 def startElement(self
, name
, attrs
):
704 #print "start", name, attrs
706 def endElement(self
, name
):
708 if self
.element
== "text":
710 if self
.page
in self
.long:
711 print(self
.page
, len(self
.text
))
713 self
.doPage(self
.page
, self
.text
)
717 def characters(self
, content
):
718 #print "characters", content
719 if self
.element
== "title":
720 if self
.checkPage(content
):
722 elif self
.element
== "text":
725 if len(self
.text
) > 100000 and self
.page
not in self
.long:
726 self
.long[self
.page
] = 1
727 def checkPage(self
, page
):
729 def doPage(self
, page
, text
):
class TemplateHandler(WikiHandler):
    """WikiHandler subclass that records template pages in ``Templates``.

    ``Templates`` is a module-level mapping defined outside this class.
    """

    def checkPage(self, page):
        """Return True when the title *page* begins with ``Template:``."""
        prefix = "Template:"
        return page.startswith(prefix)

    def doPage(self, page, text):
        """Store *text* in ``Templates`` under the lower-cased bare title.

        The key is everything after the first ':' in *page*; a title with
        no ':' is used whole.
        """
        bare_title = page.split(':', 1)[-1]
        Templates[bare_title.lower()] = text
738 class WordHandler(WikiHandler
):
def checkPage(self, page):
    """Return True when the title *page* contains no ':' character.

    NOTE(review): presumably this filters out namespaced pages such as
    "Template:..." -- confirm against the dump's title conventions.
    """
    return page.find(':') < 0
741 def doPage(self
, page
, text
):
742 m
= re
.match(r
"#redirect\s*\[\[(.*?)\]\]", text
, re
.IGNORECASE
)
744 out
.write(" See <%s>" % page
)
746 doc
= parse(page
, text
)
747 out
.write(formatBrief(page
, doc
))
748 #print formatBrief(page, doc)
751 info
= """ This file was converted from the original database on:
754 The original data is available from:
755 http://en.wiktionary.org
756 The version from which this file was generated was:
759 Wiktionary is available under the GNU Free Documentation License.
760 """ % (time
.ctime(), os
.path
.basename(fn
))
762 errors
= open("mkdict.err", "w")
765 f
= os
.popen("bunzip2 -c %s" % fn
, "r")
766 xml
.sax
.parse(f
, TemplateHandler())
769 f
= os
.popen("bunzip2 -c %s" % fn
, "r")
770 out
= os
.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")
772 out
.write("%%h English Wiktionary\n%s" % info
)
773 xml
.sax
.parse(f
, WordHandler())