Started to produce top-down code for JMdict SQLite3 database.
[jblite.git] / jblite / jmdict_proto1.py
blob b5147459b7d71bdfadf4ea4112dd8fa527c9fb5f
1 import warnings
2 from lxml import etree
# lxml expands namespace prefixes into the "{namespace-uri}name" form, so
# the xml:lang attribute has to be looked up under this expanded name.
XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"  # xml:lang
def skip_elem(elem_name):
    """Warn that an unsupported XML element was encountered.

    Emits a warning (default category ``UserWarning``) stating that
    elements with the given name will be skipped.  Nothing else is done;
    the caller is responsible for actually skipping the element.

    elem_name -- tag name of the element, without angle brackets.
    """
    # stacklevel=2 attributes the warning to the code that called
    # skip_elem() rather than to this helper itself.
    warnings.warn("Encountered unsupported element <%s>. "
                  "<%s> elements will be skipped." % (elem_name, elem_name),
                  stacklevel=2)
def main():
    """Parse a JMdict XML file and walk every field of every entry.

    The path of the XML file is taken from the first command-line
    argument (``sys.argv[1]``).  This is prototype code: each value is
    extracted into a local variable, but nothing is done with it yet; the
    ``TODO`` comments mark where real handling belongs.  Elements that are
    not supported yet (``<etym>`` inside ``<info>`` and ``<pri>`` inside
    ``<gloss>``) only trigger a warning through ``skip_elem``.

    Exits with a usage message if no file path was given.
    """
    import sys

    if len(sys.argv) < 2:
        sys.exit("Usage: %s JMDICT_XML_FILE" % sys.argv[0])

    parser = etree.XMLParser(resolve_entities=False)
    # Binary mode: the XML parser must see the raw bytes so that it can
    # apply the encoding declared by the document itself.  Text mode would
    # decode with the platform's default encoding first, which may not
    # match the file.
    with open(sys.argv[1], 'rb') as infile:
        et = etree.ElementTree(file=infile, parser=parser)
    root = et.getroot()

    for child in root.iterchildren():
        ent_seq = int(child.find('ent_seq').text)

        # Kanji elements.
        for k_ele in child.iterfind('k_ele'):
            blob = k_ele.find('keb').text
            # TODO: do something with k_ele.keb
            for ke_inf in k_ele.iterfind('ke_inf'):
                info = ke_inf.text
                # TODO: do something with k_ele.ke_inf
            for ke_pri in k_ele.iterfind('ke_pri'):
                priority = ke_pri.text
                # TODO: do something with k_ele.ke_pri

        # Reading elements.
        for r_ele in child.iterfind('r_ele'):
            blob = r_ele.find('reb').text
            # TODO: do something with r_ele.reb
            # <re_nokanji> is treated as a flag: only its presence matters.
            nokanji = r_ele.find('re_nokanji') is not None
            # TODO: do something with r_ele.nokanji
            for re_restr in r_ele.iterfind('re_restr'):
                restr = re_restr.text
                # TODO: do something with r_ele.re_restr
            for re_inf in r_ele.iterfind('re_inf'):
                info = re_inf.text
                # TODO: do something with r_ele.re_inf
            for re_pri in r_ele.iterfind('re_pri'):
                priority = re_pri.text
                # TODO: do something with r_ele.re_pri

        # Optional <info> element of the entry.
        info_elem = child.find('info')
        if info_elem is not None:
            for links in info_elem.iterfind('links'):
                tag = links.find('link_tag').text
                desc = links.find('link_desc').text
                uri = links.find('link_uri').text
                # TODO: do something...
            for bibl in info_elem.iterfind('bibl'):
                # Both children are handled as optional; a missing one
                # yields None.
                tag = bibl.find('bib_tag')
                txt = bibl.find('bib_txt')
                tag = tag.text if tag is not None else None
                txt = txt.text if txt is not None else None
                # TODO: do something...
            for _etym in info_elem.iterfind('etym'):
                # Not yet supported: if we ever encounter, warn.
                skip_elem('etym')
            for audit in info_elem.iterfind('audit'):
                upd_date = audit.find('upd_date').text
                upd_detl = audit.find('upd_detl').text
                # TODO: do something...

        # Senses.
        for sense in child.iterfind('sense'):
            for stagk in sense.iterfind('stagk'):
                text = stagk.text
                # TODO: do something...
            for stagr in sense.iterfind('stagr'):
                text = stagr.text
                # TODO: do something...
            for pos in sense.iterfind('pos'):
                # NOTE(review): the content looks like an entity reference;
                # since the parser was created with resolve_entities=False,
                # confirm that .text really holds the value wanted here.
                text = pos.text
                # TODO: do something...
            for xref in sense.iterfind('xref'):
                # Text with a special format; just keep it as is for now.
                text = xref.text
                # TODO: do something...
            for ant in sense.iterfind('ant'):
                text = ant.text
                # TODO: do something...
            for field in sense.iterfind('field'):
                text = field.text
                # TODO: do something...
            for misc in sense.iterfind('misc'):
                text = misc.text
                # TODO: do something...
            for s_inf in sense.iterfind('s_inf'):
                text = s_inf.text
                # TODO: do something...
            for lsource in sense.iterfind('lsource'):
                text = lsource.text
                lang = lsource.get(XML_LANG)
                ltype = lsource.get('ls_type')
                wasei = lsource.get('ls_wasei')
                # TODO: do something...
            for dial in sense.iterfind('dial'):
                text = dial.text
                # TODO: do something...
            for gloss in sense.iterfind('gloss'):
                lang = gloss.get(XML_LANG)
                gender = gloss.get('g_gend')
                text = gloss.text
                # TODO: do something...
                for _pri in gloss.iterfind('pri'):
                    # Not yet supported: if we ever encounter, warn.
                    skip_elem('pri')
            for example in sense.iterfind('example'):
                text = example.text
                # TODO: do something...
# Run the prototype only when executed as a script, not when imported.
if __name__ == "__main__":
    main()