Merge pull request #330634 from r-ryantm/auto-update/circumflex
[NixPkgs.git] / pkgs / servers / dict / wordnet_structures.py
blobe5c80b968fc2301bdd47aad5f66aa6c1bf22a6c1
1 #!/usr/bin/env python3
2 #Copyright 2007 Sebastian Hagen
3 # This file is part of wordnet_tools.
5 # wordnet_tools is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License version 2
7 # as published by the Free Software Foundation
9 # wordnet_tools is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
14 # You should have received a copy of the GNU General Public License
15 # along with wordnet_tools; if not, write to the Free Software
16 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 # This program requires python >= 2.4.
20 # This program converts wordnet index/data file pairs into dict index/data
21 # files usable by dictd.
22 # This is basically a reimplementation of the wnfilter program by Rik Faith,
23 # which unfortunately doesn't work correctly for wordnet files in the newer
24 # formats. This version of wordnet_structures should parse wordnet 2.1 files
25 # correctly, and create output very similar to what wnfilter would have
26 # written.
28 import datetime
29 import math
30 from textwrap import TextWrapper
# Integer identifiers for the word categories used throughout this module.
(CAT_ADJECTIVE, CAT_ADVERB, CAT_NOUN, CAT_VERB) = range(4)

# Translates the one-letter category code read from wordnet files into a
# CAT_* identifier. Both 'a' and 's' resolve to CAT_ADJECTIVE.
category_map = dict(
    n=CAT_NOUN,
    v=CAT_VERB,
    a=CAT_ADJECTIVE,
    s=CAT_ADJECTIVE,
    r=CAT_ADVERB,
)
class WordIndex:
    """One line of a wordnet index file: a lemma within one word category.

    Attributes:
        lemma: the headword, exactly as spelled on the index line.
        category: CAT_* value obtained from ``category_map``.
        ptrs: list of the pointer symbol tokens found on the line.
        synsets: values looked up in the synset map, in index-line order.
        tagsense_count: integer read from the field just before the offsets.
    """

    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        """Parse a single index line into a new instance.

        The whitespace-separated fields are consumed as follows:
            0               lemma
            1               category letter (key of ``category_map``)
            2               number of synset offsets (decimal)
            3               number of pointer symbols, p (decimal)
            4 .. 3+p        pointer symbols
            4+p             not read
            5+p             tagsense count (decimal)
            6+p ..          synset offsets (decimal), looked up in synset_map

        Raises:
            IndexError: the line has fewer fields than its counts announce.
            KeyError: unknown category letter, or an offset missing from
                synset_map.
            ValueError: a numeric field is not a valid decimal number.
        """
        fields = line_data.split()
        lemma = fields[0]
        category = category_map[fields[1]]
        synset_count = int(fields[2], 10)
        ptr_count = int(fields[3], 10)
        # The symbols follow the count in field 3. Starting the range at 3
        # would store the count itself and lose the last symbol, which is
        # inconsistent with the 5+p / 6+p offsets used below.
        ptrs = [fields[i] for i in range(4, 4 + ptr_count)]
        tagsense_count = int(fields[5 + ptr_count], 10)
        first_offset = 6 + ptr_count
        synsets = [
            synset_map[int(fields[i], 10)]
            for i in range(first_offset, first_offset + synset_count)
        ]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        """Parse every entry line of an index file.

        Args:
            f: iterable of text lines.
            synset_map: mapping from integer synset offset to synset object.
            rv_base: optional dict to extend in place; a new dict is created
                when it is None.

        Returns:
            The dict (rv_base if given) mapping each lower-cased lemma to the
            list of instances parsed for it, in file order.
        """
        rv = {} if rv_base is None else rv_base

        for line in f:
            # Lines that begin with a space are not entries; blank lines
            # carry no fields at all and would otherwise raise IndexError.
            if line.startswith(' ') or not line.strip():
                continue
            wi = cls.build_from_line(line, synset_map)
            rv.setdefault(wi.lemma.lower(), []).append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
class WordIndexDictFormatter(WordIndex):
    """WordIndex variant that renders its synsets as dict entry text."""

    # Abbreviation placed in front of the first sense, keyed by CAT_* value.
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv',
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    # "fmtf" prefixes are %-formatted and start a sense; "fmtn" prefixes are
    # used verbatim on the wrapped continuation lines of that sense.
    # NOTE(review): the continuation prefixes are a single space as seen
    # here; confirm whether wider indents matching the first-line prefixes
    # were intended.
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' '

    def dict_str(self):
        """Return all senses of this entry as wrapped, numbered text.

        The first sense is introduced by the category abbreviation and the
        number 1; each further sense is introduced by its running number.
        Every sense is wrapped to LINE_WIDTH_MAX columns and the resulting
        lines are joined with ``linesep``.
        """
        first_prefix = self.prefix_fmtf_line_first % self.category_map_rev[self.category]
        wrapper = TextWrapper(
            width=self.LINE_WIDTH_MAX,
            initial_indent=first_prefix,
            subsequent_indent=self.prefix_fmtn_line_first,
        )
        wrapped = wrapper.wrap(self.synsets[0].dict_str())

        for sense_number, synset in enumerate(self.synsets[1:], 2):
            wrapper = TextWrapper(
                width=self.LINE_WIDTH_MAX,
                initial_indent=self.prefix_fmtf_line_nonfirst % sense_number,
                subsequent_indent=self.prefix_fmtn_line_nonfirst,
            )
            wrapped += wrapper.wrap(synset.dict_str())
        return self.linesep.join(wrapped)
class Synset:
    """One line of a wordnet data file.

    Attributes:
        offset: integer parsed from the first field; used as lookup key.
        type: CAT_* value obtained from ``category_map``.
        words: list of the word tokens of this synset.
        ptrs: list of 4-tuples of raw pointer tokens.
        gloss: text following the '|' separator, or None if there is none.
        frames: sequence of (int, int) pairs parsed from the frame fields.
        comments: list created empty by the constructor.
    """

    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        """Parse a single data line into a new instance.

        The whitespace-separated fields are consumed as follows, with w the
        word count and p the pointer count:
            0               synset offset (decimal)
            1               not read
            2               category letter (key of ``category_map``)
            3               w (hexadecimal)
            4 .. 3+2w       w pairs; the first token of each pair is the word
            4+2w            p (decimal)
            5+2w ..         p groups of four raw pointer tokens
            next            either '|' or a decimal frame count followed by
                            that many three-token groups and then '|'
            rest            the gloss, kept with its internal whitespace

        Raises:
            IndexError: the line has fewer fields than its counts announce.
            KeyError: unknown category letter.
            ValueError: a numeric field cannot be parsed.
        """
        fields = line_data.split()
        synset_offset = int(fields[0], 10)
        ss_type = category_map[fields[2]]
        word_count = int(fields[3], 16)
        words = [fields[i] for i in range(4, 4 + word_count * 2, 2)]

        ptr_count_pos = 4 + word_count * 2
        ptr_count = int(fields[ptr_count_pos], 10)
        ptr_start = ptr_count_pos + 1
        ptr_end = ptr_start + ptr_count * 4
        ptrs = [
            (fields[i], fields[i + 1], fields[i + 2], fields[i + 3])
            for i in range(ptr_start, ptr_end, 4)
        ]

        tok = fields[ptr_end]
        gloss_pos = ptr_end + 1
        if tok != '|':
            frame_count = int(tok, 10)
            # Each frame group is three tokens; the first one is skipped.
            frames = [
                (int(fields[i + 1], 10), int(fields[i + 2], 16))
                for i in range(gloss_pos, gloss_pos + frame_count * 3, 3)
            ]
            # Step over the frame groups and the '|' that follows them.
            gloss_pos += frame_count * 3 + 1
        else:
            frames = []

        # Re-split with a limit so the gloss stays in one piece. The gloss is
        # element number gloss_pos, so it only exists if the result is longer
        # than gloss_pos; otherwise the last element is the '|' separator.
        head_and_gloss = line_data.split(None, gloss_pos)
        if len(head_and_gloss) > gloss_pos:
            gloss = head_and_gloss[gloss_pos]
        else:
            gloss = None

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        """Parse a whole data file.

        Lines beginning with a space are treated as numbered notice lines:
        if their first token is an integer, the remainder of the line (or ''
        when there is none) is collected; otherwise the line is ignored.
        Blank lines are skipped. Every other line is parsed as a synset.

        Args:
            f: iterable of text lines.

        Returns:
            A tuple ``(synsets, comments)`` where synsets maps each integer
            offset to its Synset and comments is the list of notice texts.
        """
        rv = {}
        comments = []

        for line in f:
            if not line.strip():
                # Nothing to parse; indexing the empty split would fail.
                continue
            if line.startswith(' '):
                number, _, text = line.lstrip().rstrip('\n').partition(' ')
                pieces = line.lstrip().rstrip('\n').split(None, 1)
                try:
                    int(pieces[0])
                except ValueError:
                    continue
                comments.append(pieces[1] if len(pieces) > 1 else '')
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        """Return the gloss, followed by a synonym list if there are several words.

        A missing gloss (None) is rendered as empty text instead of raising.
        """
        gloss = self.gloss or ''
        if len(self.words) > 1:
            synonyms = '[syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
            if gloss:
                return '%s %s' % (gloss, synonyms)
            return synonyms
        return gloss

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
class WordnetDict:
    """Collects parsed wordnet data and writes it as dict index/data files.

    Attributes:
        word_data: dict mapping a lower-cased lemma to the list of entries
            parsed for it by wn_dict_add().
        wn_url: URL written into the database-info and database-url entries.
        desc_short: text of the 00-database-short entry.
        desc_long: text of the 00-database-long entry.
        wn_license: notice text taken from the most recently added data file
            that contained numbered notice lines; None until then.
    """

    # NOTE(review): the text below is reproduced exactly as seen; the
    # embedded numbering of this listing skips lines inside the template, so
    # blank lines and indentation may be missing -- confirm against upstream.
    db_info_fmt = '''This file was converted from the original database on:
%(conversion_datetime)s
The original data is available from:
%(wn_url)s
The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restriction that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        """Parse one index/data file pair and merge it into word_data.

        Both files are rewound first. If the data file yields any numbered
        notice lines they replace the current wn_license.
        """
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string

        The number is written in base 64, most significant digit first, using
        base64_map as the digit alphabet; 0 encodes as 'A'.

        Raises:
            ValueError: i is negative.
        """
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Pure integer arithmetic: float division rounds large values up and
        # can yield a digit of 64, which is outside the alphabet.
        digits = []
        while True:
            (i, d) = divmod(i, 64)
            digits.append(cls.base64_map[d])
            if (i == 0):
                break
        digits.reverse()
        return ''.join(digits)

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files

        The entry text goes to file_data; a line of the form
        ``key<TAB>start<TAB>length`` (both numbers base64 encoded) followed
        by linesep goes to file_index.
        """
        entry_start = file_data.tell()
        file_data.write(entry)
        # Measure what actually reached the data file instead of counting
        # characters: the two differ as soon as a character is encoded to
        # more than one byte, and the index must agree with entry_start.
        entry_len = file_data.tell() - entry_start
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

    @staticmethod
    def _headword_case(word, entries):
        """Return the first synset word equal to <word> ignoring case, else <word>."""
        wanted = word.lower()
        for wi in entries:
            for synset in wi.synsets:
                for ss_word in synset.words:
                    if (ss_word.lower() == wanted):
                        return ss_word
        return word

    def dict_generate(self, file_index, file_data):
        """Write all collected data to the given index and data files.

        Both files are rewound, filled with the four 00-database-* entries
        followed by one entry per key of word_data in sorted order, and then
        truncated at the final position.
        """
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the whole
        # story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        db_info = self.db_info_fmt % {
            'conversion_datetime': datetime.datetime.now().strftime(self.datetime_fmt),
            'wn_url': self.wn_url,
            'wn_license': self.wn_license,
        }
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % db_info)
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        for word in sorted(self.word_data):
            entries = self.word_data[word]
            # Use case-sensitivity information of first entry of first synset
            # that matches this word case-insensitively
            word_cs = self._headword_case(word, entries)
            linesep = entries[-1].linesep

            body = ''.join([wi.dict_str() + '\n' for wi in entries])
            outstr = '%s%s%s' % (word_cs, linesep, body)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, linesep)

        file_index.truncate()
        file_data.truncate()
def main(argv=None):
    """Command line entry point: convert wordnet file pairs to dict files.

    Args:
        argv: list of command line arguments without the program name;
            None makes the option parser read them from sys.argv.

    Positional arguments are consumed as (index file, data file) pairs. An
    odd number of them is reported as a usage error before anything is read.
    """
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    # NOTE(review): the default URL names release 2.0 while the default short
    # description names 2.1 -- confirm which one is intended.
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args(argv)

    # Each index file needs its data file; without this check the loop below
    # would fail with an IndexError after already processing earlier pairs.
    if (len(args) % 2):
        op.error('input files must be given as <wn_index_file> <wn_data_file> pairs')

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for (index_name, data_name) in zip(args[0::2], args[1::2]):
        print('Opening index file %r...' % index_name)
        with open(index_name) as file_index:
            print('Opening data file %r...' % data_name)
            with open(data_name) as file_data:
                print('Parsing index file and data file...')
                wnd.wn_dict_add(file_index, file_data)

    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))

    # Context managers guarantee the output is flushed and closed.
    with open(options.oi, 'w') as out_index:
        with open(options.od, 'w') as out_data:
            wnd.dict_generate(out_index, out_data)
    print('All done.')


if (__name__ == '__main__'):
    main()