# lxml normalizes XML namespaces, so the xml:lang attribute cannot be looked
# up by its prefixed name; it is addressed by the namespace-qualified
# "{namespace-uri}lang" form instead.
XML_LANG = '{http://www.w3.org/XML/1998/namespace}lang'  # xml:lang
def skip_elem(elem_name):
    """Warn that an unsupported element was encountered and is being skipped.

    Args:
        elem_name: Tag name of the element; it is interpolated twice into
            the warning message.
    """
    # The message previously misspelled "unsupported" as "unsuppored".
    warnings.warn(
        "Encountered unsupported element <%s>. "
        "<%s> elements will be skipped." % (elem_name, elem_name)
    )
# Build an lxml parser with entity resolution turned off, then parse the
# file named by the first command-line argument into an element tree.
15 parser
= etree
.XMLParser(resolve_entities
=False)
17 with
open(sys
.argv
[1]) as infile
:
18 et
= etree
.ElementTree(file=infile
, parser
=parser
)
# Walk every child element of `root`.
# NOTE(review): `root` is not assigned in the lines visible here --
# presumably it is the root element of `et`; confirm.
20 for child
in root
.iterchildren():
# Integer parsed from the text of the entry's <ent_seq> child.
21 ent_seq
= int(child
.find('ent_seq').text
)
# <k_ele> children: text of <keb>, then every <ke_inf> and <ke_pri>.
22 for k_ele
in child
.iterfind('k_ele'):
23 blob
= k_ele
.find('keb').text
24 #print u"k_ele.keb:", blob
25 for ke_inf
in k_ele
.iterfind('ke_inf'):
27 #print u"k_ele.ke_inf:", info
28 for ke_pri
in k_ele
.iterfind('ke_pri'):
29 priority
= ke_pri
.text
30 #print u"k_ele.ke_pri:", priority
# <r_ele> children: text of <reb>, whether <re_nokanji> is present, then
# every <re_restr>, <re_inf> and <re_pri>.
31 for r_ele
in child
.iterfind('r_ele'):
32 blob
= r_ele
.find('reb').text
33 #print u"r_ele.reb:", blob
# NOTE(review): the continuation of this conditional expression (after the
# trailing backslash) is not visible here; confirm the else-branch.
34 nokanji
= True if r_ele
.find('re_nokanji') is not None \
36 #print u"r_ele.nokanji:", nokanji
37 for re_restr
in r_ele
.iterfind('re_restr'):
39 #print u"r_ele.re_restr:", restr
40 for re_inf
in r_ele
.iterfind('re_inf'):
42 #print u"r_ele.re_inf:", info
43 for re_pri
in r_ele
.iterfind('re_pri'):
44 priority
= re_pri
.text
45 #print u"r_ele.re_pri:", priority
# <info> child of the entry: its <links>, <bibl>, <etym> and <audit>
# elements are read below.
# NOTE(review): find() returns None when <info> is absent; no None check is
# visible in these lines -- confirm one exists before the loops below.
46 info
= child
.find('info')
48 for links
in info
.iterfind('links'):
49 tag
= links
.find('link_tag').text
50 desc
= links
.find('link_desc').text
51 uri
= links
.find('link_uri').text
53 for bibl
in info
.iterfind('bibl'):
54 tag
= bibl
.find('bib_tag')
55 txt
= bibl
.find('bib_txt')
# <bib_tag> / <bib_txt> may be missing, so fall back to None instead of
# reading .text on a missing element.
56 tag
= tag
.text
if tag
is not None else None
57 txt
= txt
.text
if txt
is not None else None
59 for etym
in info
.iterfind('etym'):
60 # Not yet supported: if we ever encounter, warn.
62 for audit
in info
.iterfind('audit'):
63 upd_date
= audit
.find('upd_date').text
64 upd_detl
= audit
.find('upd_detl').text
# <sense> children: each kind of sub-element is iterated in turn.
# NOTE(review): several of these loops have no body in the lines visible
# here; confirm what each one does.
66 for sense
in child
.iterfind('sense'):
67 for stagk
in sense
.iterfind('stagk'):
70 for stagr
in sense
.iterfind('stagr'):
73 for pos
in sense
.iterfind('pos'):
74 text
= pos
.text
# entity, right...?
76 for xref
in sense
.iterfind('xref'):
77 text
= xref
.text
# text w/ special format; just store for now
79 for ant
in sense
.iterfind('ant'):
82 for field
in sense
.iterfind('field'):
85 for misc
in sense
.iterfind('misc'):
88 for s_inf
in sense
.iterfind('s_inf'):
# <lsource> attributes: xml:lang (via the namespace-qualified XML_LANG
# key), ls_type and ls_wasei.
91 for lsource
in sense
.iterfind('lsource'):
93 lang
= lsource
.get(XML_LANG
)
94 ltype
= lsource
.get('ls_type')
95 wasei
= lsource
.get('ls_wasei')
97 for dial
in sense
.iterfind('dial'):
# <gloss> attributes: xml:lang and g_gend; nested <pri> elements are
# flagged below as not yet supported.
100 for gloss
in sense
.iterfind('gloss'):
101 lang
= gloss
.get(XML_LANG
)
102 gender
= gloss
.get('g_gend')
105 for pri
in gloss
.iterfind('pri'):
106 # Not yet supported: if we ever encounter, warn.
108 for example
in sense
.iterfind('example'):
111 if __name__
== "__main__":