2 #Copyright 2007 Sebastian Hagen
3 # This file is part of wordnet_tools.
5 # wordnet_tools is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License version 2
7 # as published by the Free Software Foundation
9 # wordnet_tools is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with wordnet_tools; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 # This program requires python >= 2.4.
20 # This program converts wordnet index/data file pairs into dict index/data
21 # files usable by dictd.
22 # This is basically a reimplementation of the wnfilter program by Rik Faith,
23 # which unfortunately doesn't work correctly for wordnet files in the newer
24 # formats. This version of wordnet_structures should parse wordnet 2.1 files
25 # correctly, and create output very similar to what wnfilter would have
30 from textwrap
import TextWrapper
def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
    """Store the fields of one parsed index entry on the instance.

    lemma: the entry's word; build_from_file lower-cases it to obtain the
        key under which the entry is grouped.
    category: the category_map value for the entry's category code.
    ptrs: list of pointer-symbol tokens taken from the index line.
    synsets: list of synset objects resolved for this entry.
    tagsense_count: integer parsed from the index line.
    """
    # lemma and ptrs must be kept as well: build_from_file reads .lemma and
    # __repr__ reads both attributes.
    self.lemma = lemma
    self.category = category
    self.ptrs = ptrs
    self.synsets = synsets
    self.tagsense_count = tagsense_count
def build_from_line(cls, line_data, synset_map):
    """Build an instance from one whitespace-separated index-file line.

    The fields are read by position: lemma, category code, synset count,
    pointer count, one token per pointer, one field that is not used, the
    tagsense count, and finally one decimal synset offset per synset.

    line_data: text of a single index line.
    synset_map: mapping from integer synset offset to the synset object to
        attach to the entry.
    Returns cls(lemma, category, ptrs, synsets, tagsense_count).
    Raises KeyError for a category code missing from category_map or an
    offset missing from synset_map, ValueError for a non-numeric count or
    offset, and IndexError if the line has fewer fields than its counts
    announce.
    """
    fields = line_data.split()
    lemma = fields[0]
    category = category_map[fields[1]]
    synset_count = int(fields[2], 10)
    ptr_count = int(fields[3], 10)
    # The pointer tokens FOLLOW the pointer count, so they occupy indices
    # 4 .. 3+ptr_count. Starting at 3 would return the count itself and
    # lose the last pointer token.
    ptrs = fields[4:4 + ptr_count]
    # Index 4+ptr_count holds a field this class does not keep.
    tagsense_count = int(fields[5 + ptr_count], 10)
    first_offset = 6 + ptr_count
    # Indexed access (not a slice) so that a truncated line still raises
    # IndexError instead of silently yielding fewer synsets.
    synsets = [synset_map[int(fields[pos], 10)]
               for pos in range(first_offset, first_offset + synset_count)]
    return cls(lemma, category, ptrs, synsets, tagsense_count)
67 def build_from_file(cls
, f
, synset_map
, rv_base
=None):
74 if (line
.startswith(' ')):
76 wi
= cls
.build_from_line(line
, synset_map
)
77 word
= wi
.lemma
.lower()
84 return '%s%s' % (self
.__class
__.__name
__, (self
.lemma
, self
.category
, self
.ptrs
, self
.synsets
, self
.tagsense_count
))
87 class WordIndexDictFormatter(WordIndex
):
96 prefix_fmtf_line_first
= '%5s 1: '
97 prefix_fmtn_line_first
= ' '
98 prefix_fmtf_line_nonfirst
= '%5d: '
99 prefix_fmtn_line_nonfirst
= ' '
102 tw
= TextWrapper(width
=self
.LINE_WIDTH_MAX
,
103 initial_indent
=(self
.prefix_fmtf_line_first
% self
.category_map_rev
[self
.category
]),
104 subsequent_indent
=self
.prefix_fmtn_line_first
)
106 lines
= (tw
.wrap(self
.synsets
[0].dict_str()))
108 for synset
in self
.synsets
[1:]:
109 tw
= TextWrapper(width
=self
.LINE_WIDTH_MAX
,
110 initial_indent
=(self
.prefix_fmtf_line_nonfirst
% i
),
111 subsequent_indent
=self
.prefix_fmtn_line_nonfirst
)
112 lines
.extend(tw
.wrap(synset
.dict_str()))
114 return self
.linesep
.join(lines
)
118 def __init__(self
, offset
, ss_type
, words
, ptrs
, gloss
, frames
=()):
128 def build_from_line(cls
, line_data
):
129 line_split
= line_data
.split()
130 synset_offset
= int(line_split
[0],10)
131 ss_type
= category_map
[line_split
[2]]
132 word_count
= int(line_split
[3],16)
133 words
= [line_split
[i
] for i
in range(4, 4 + word_count
*2,2)]
134 ptr_count
= int(line_split
[4 + word_count
*2],10)
135 ptrs
= [(line_split
[i
], line_split
[i
+1], line_split
[i
+2], line_split
[i
+3]) for i
in range(5 + word_count
*2,4 + word_count
*2 + ptr_count
*4,4)]
137 tok
= line_split
[5 + word_count
*2 + ptr_count
*4]
138 base
= 6 + word_count
*2 + ptr_count
*4
140 frame_count
= int(tok
, 10)
141 frames
= [(int(line_split
[i
+1],10), int(line_split
[i
+2],16)) for i
in range(base
, base
+ frame_count
*3, 3)]
142 base
+= frame_count
*3 + 1
146 line_split2
= line_data
.split(None, base
)
147 if (len(line_split2
) < base
):
150 gloss
= line_split2
[-1]
152 return cls(synset_offset
, ss_type
, words
, ptrs
, gloss
, frames
)
155 def build_from_file(cls
, f
):
160 if (line
.startswith(' ')):
161 line_s
= line
.lstrip().rstrip('\n')
162 line_elements
= line_s
.split(None,1)
164 int(line_elements
[0])
167 if (len(line_elements
) == 1):
168 line_elements
.append('')
169 comments
.append(line_elements
[1])
171 synset
= cls
.build_from_line(line
.rstrip())
172 rv
[synset
.offset
] = synset
174 return (rv
, comments
)
178 if (len(self
.words
) > 1):
179 rv
+= ' [syn: %s]' % (', '.join([('{%s}' % word
) for word
in self
.words
]))
183 return '%s%s' % (self
.__class
__.__name
__, (self
.offset
, self
.type, self
.words
, self
.ptrs
, self
.gloss
, self
.frames
))
187 db_info_fmt
= '''This file was converted from the original database on:
188 %(conversion_datetime)s
190 The original data is available from:
193 The original data was distributed with the notice shown below. No
194 additional restrictions are claimed. Please redistribute this changed
195 version under the same conditions and restriction that apply to the
196 original version.\n\n
199 datetime_fmt
= '%Y-%m-%dT%H:%M:%S'
200 base64_map
= 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
def __init__(self, wn_url, desc_short, desc_long):
    """Create an empty dictionary builder.

    wn_url: text written as the body of the '00-database-url' entry.
    desc_short: text written as the body of the '00-database-short' entry.
    desc_long: text written as the body of the '00-database-long' entry.
    """
    # Accumulator handed to WordIndexDictFormatter.build_from_file by
    # wn_dict_add and later read with .keys() / [word] by dict_generate.
    # NOTE(review): a plain dict satisfies every use visible here; confirm
    # against build_from_file that no other mapping type is expected.
    self.word_data = {}
    # dict_generate reads self.wn_url, so it has to be stored as well.
    self.wn_url = wn_url
    self.desc_short = desc_short
    self.desc_long = desc_long
    # Stays None until wn_dict_add stores text collected from a data file.
    self.wn_license = None
def wn_dict_add(self, file_index, file_data):
    """Parse one index/data file pair and merge it into self.word_data.

    file_index: iterable index file, passed to
        WordIndexDictFormatter.build_from_file.
    file_data: iterable data file, passed to Synset.build_from_file, which
        returns the synsets keyed by offset together with the comment
        lines it collected from the file.
    """
    (synsets, license_lines) = Synset.build_from_file(file_data)
    WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
    # Only replace the stored text when this data file actually supplied
    # some; otherwise a file without such lines would overwrite the text
    # from an earlier pair with a bare newline.
    if license_lines:
        self.wn_license = '\n'.join(license_lines) + '\n'
def base64_encode(cls, i):
    """Encode a non-negative integer into a dictd compatible base64 string

    The value is written in base 64, most significant digit first, using
    cls.base64_map as the digit alphabet and without any padding; zero is
    encoded as the single digit for 0.

    i: non-negative integer to encode.
    Returns the encoded string.
    Raises ValueError if i is negative.
    """
    if i < 0:
        raise ValueError('Value %r for i is negative' % (i,))
    # Pure integer arithmetic: dividing with '/' and flooring goes through
    # floating point, which is inexact for large values and can even yield
    # a digit of 64 (outside the alphabet).
    digits = []
    while True:
        (i, d) = divmod(i, 64)
        digits.append(cls.base64_map[d])
        if i == 0:
            break
    # Digits were produced least significant first.
    digits.reverse()
    return ''.join(digits)
def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
    """Write a single dict entry for <key> to index and data files

    file_index: writable file receiving one line of the form
        key<TAB>start<TAB>length<linesep>, with start and length encoded
        by cls.base64_encode.
    file_data: writable file supporting tell(); entry is appended at its
        current position.
    key: headword to record in the index.
    entry: complete entry text to write to file_data.
    linesep: terminator appended to the index line.
    """
    entry_start = file_data.tell()
    file_data.write(entry)
    # Measure the length in the same units as the start position. len(entry)
    # counts characters, which differs from what the file position advanced
    # by as soon as the entry contains non-ASCII text or the file translates
    # newlines.
    entry_len = file_data.tell() - entry_start
    file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
                                       cls.base64_encode(entry_len), linesep))
245 def dict_generate(self
, file_index
, file_data
):
248 # The dictd file format is fairly iffy on the subject of special
249 # headwords: either dictd is buggy, or the manpage doesn't tell the whole
250 # story about the format.
251 # The upshot is that order of these entries in the index *matters*.
252 # Putting them at the beginning and in alphabetic order is afaict ok.
253 # Some other orders completely and quietly break the ability to look
254 # those headwords up.
255 # -- problem encountered with 1.10.2, at 2007-08-05.
256 file_data
.write('\n')
258 conversion_datetime
= datetime
.datetime
.now().strftime(self
.datetime_fmt
)
259 wn_license
= self
.wn_license
260 self
.dict_entry_write(file_index
, file_data
, '00-database-info', '00-database-info\n%s\n' % (self
.db_info_fmt
% vars()))
261 self
.dict_entry_write(file_index
, file_data
, '00-database-long', '00-database-long\n%s\n' % self
.desc_long
)
262 self
.dict_entry_write(file_index
, file_data
, '00-database-short', '00-database-short\n%s\n' % self
.desc_short
)
263 self
.dict_entry_write(file_index
, file_data
, '00-database-url', '00-database-url\n%s\n' % self
.wn_url
)
266 words
= list(self
.word_data
.keys())
269 for wi
in self
.word_data
[word
]:
271 # Use case-sensitivity information of first entry of first synset that
272 # matches this word case-insensitively
273 for synset
in wi
.synsets
:
274 for ss_word
in synset
.words
:
275 if (ss_word
.lower() == word_cs
.lower()):
286 for wi
in self
.word_data
[word
]:
287 outstr
+= wi
.dict_str() + '\n'
289 outstr
= '%s%s%s' % (word_cs
, wi
.linesep
, outstr
)
290 self
.dict_entry_write(file_index
, file_data
, word_cs
, outstr
, wi
.linesep
)
292 file_index
.truncate()
if (__name__ == '__main__'):
    # Command line entry point: parse every <wn_index_file> <wn_data_file>
    # pair given as arguments into one WordnetDict, then write the combined
    # dict index and data files.
    # optparse is only needed when running as a script.
    import optparse

    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    # Input files are consumed two at a time; reject an empty or odd-length
    # list up front instead of failing with IndexError after some files have
    # already been opened and parsed.
    if (not args) or (len(args) % 2 != 0):
        op.error('expected one or more <wn_index_file> <wn_data_file> pairs')

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for i in range(0, len(args), 2):
        print('Opening index file %r...' % args[i])
        # 'with' closes each input file as soon as its pair has been parsed.
        with open(args[i]) as file_index:
            print('Opening data file %r...' % args[i+1])
            with open(args[i+1]) as file_data:
                print('Parsing index file and data file...')
                wnd.wn_dict_add(file_index, file_data)

    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))

    # Close (and thereby flush) both output files deterministically rather
    # than relying on interpreter shutdown.
    with open(options.oi, 'w') as out_index:
        with open(options.od, 'w') as out_data:
            wnd.dict_generate(out_index, out_data)