pkgs/servers/dict/wordnet_structures.py

   1 #!/usr/bin/env python3
   2 #Copyright 2007 Sebastian Hagen
   3 # This file is part of wordnet_tools.
   4
   5 # wordnet_tools is free software; you can redistribute it and/or modify
   6 # it under the terms of the GNU General Public License version 2
   7 # as published by the Free Software Foundation
   8
   9 # wordnet_tools is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13
  14 # You should have received a copy of the GNU General Public License
  15 # along with wordnet_tools; if not, write to the Free Software
  16 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17
  18 # This program requires python 3.
  19
  20 # This program converts wordnet index/data file pairs into dict index/data
  21 # files usable by dictd.
  22 # This is basically a reimplementation of the wnfilter program by Rik Faith,
  23 # which unfortunately doesn't work correctly for wordnet files in the newer
  24 # formats. This version of wordnet_structures should parse wordnet 2.1 files
  25 # correctly, and create output very similar to what wnfilter would have
  26 # written.
  27
  28 import datetime
  29 import math
  30 from textwrap import TextWrapper
  31
  32 CAT_ADJECTIVE = 0
  33 CAT_ADVERB = 1
  34 CAT_NOUN = 2
  35 CAT_VERB = 3
  36
  37 category_map = {
  38    'n': CAT_NOUN,
  39    'v': CAT_VERB,
  40    'a': CAT_ADJECTIVE,
  41    's': CAT_ADJECTIVE,
  42    'r': CAT_ADVERB
  43 }
  44
  45
  46 class WordIndex:
  47    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
  48       self.lemma = lemma
  49       self.category = category
  50       self.ptrs = ptrs
  51       self.synsets = synsets
  52       self.tagsense_count = tagsense_count
  53
  54    @classmethod
  55    def build_from_line(cls, line_data, synset_map):
  56       line_split = line_data.split()
  57       lemma = line_split[0]
  58       category = category_map[line_split[1]]
  59       synset_count = int(line_split[2],10)
  60       ptr_count = int(line_split[3],10)
  61       ptrs = [line_split[i] for i in range(3, 3+ptr_count)]
  62       tagsense_count = int(line_split[5 + ptr_count],10)
  63       synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
  64       return cls(lemma, category, ptrs, synsets, tagsense_count)
  65
  66    @classmethod
  67    def build_from_file(cls, f, synset_map, rv_base=None):
  68       if (rv_base is None):
  69          rv = {}
  70       else:
  71          rv = rv_base
  72
  73       for line in f:
  74          if (line.startswith('  ')):
  75             continue
  76          wi = cls.build_from_line(line, synset_map)
  77          word = wi.lemma.lower()
  78          if not (word in rv):
  79             rv[word] = []
  80          rv[word].append(wi)
  81       return rv
  82
  83    def __repr__(self):
  84       return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
  85
  86
  87 class WordIndexDictFormatter(WordIndex):
  88    category_map_rev = {
  89       CAT_NOUN: 'n',
  90       CAT_VERB: 'v',
  91       CAT_ADJECTIVE: 'adj',
  92       CAT_ADVERB: 'adv'
  93    }
  94    linesep = '\n'
  95    LINE_WIDTH_MAX = 68
  96    prefix_fmtf_line_first = '%5s 1: '
  97    prefix_fmtn_line_first = '         '
  98    prefix_fmtf_line_nonfirst = '%5d: '
  99    prefix_fmtn_line_nonfirst = '       '
 100
 101    def dict_str(self):
 102       tw = TextWrapper(width=self.LINE_WIDTH_MAX,
 103          initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
 104          subsequent_indent=self.prefix_fmtn_line_first)
 105
 106       lines = (tw.wrap(self.synsets[0].dict_str()))
 107       i = 2
 108       for synset in self.synsets[1:]:
 109          tw = TextWrapper(width=self.LINE_WIDTH_MAX,
 110             initial_indent=(self.prefix_fmtf_line_nonfirst % i),
 111             subsequent_indent=self.prefix_fmtn_line_nonfirst)
 112          lines.extend(tw.wrap(synset.dict_str()))
 113          i += 1
 114       return self.linesep.join(lines)
 115
 116
 117 class Synset:
 118    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
 119       self.offset = offset
 120       self.type = ss_type
 121       self.words = words
 122       self.ptrs = ptrs
 123       self.gloss = gloss
 124       self.frames = frames
 125       self.comments = []
 126
 127    @classmethod
 128    def build_from_line(cls, line_data):
 129       line_split = line_data.split()
 130       synset_offset = int(line_split[0],10)
 131       ss_type = category_map[line_split[2]]
 132       word_count = int(line_split[3],16)
 133       words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
 134       ptr_count = int(line_split[4 + word_count*2],10)
 135       ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]
 136
 137       tok = line_split[5 + word_count*2 + ptr_count*4]
 138       base = 6 + word_count*2 + ptr_count*4
 139       if (tok != '|'):
 140          frame_count = int(tok, 10)
 141          frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
 142          base += frame_count*3 + 1
 143       else:
 144          frames = []
 145
 146       line_split2 = line_data.split(None, base)
 147       if (len(line_split2) < base):
 148          gloss = None
 149       else:
 150          gloss = line_split2[-1]
 151
 152       return cls(synset_offset, ss_type, words, ptrs, gloss, frames)
 153
 154    @classmethod
 155    def build_from_file(cls, f):
 156       rv = {}
 157       comments = []
 158
 159       for line in f:
 160          if (line.startswith('  ')):
 161             line_s = line.lstrip().rstrip('\n')
 162             line_elements = line_s.split(None,1)
 163             try:
 164                int(line_elements[0])
 165             except ValueError:
 166                continue
 167             if (len(line_elements) == 1):
 168                line_elements.append('')
 169             comments.append(line_elements[1])
 170             continue
 171          synset = cls.build_from_line(line.rstrip())
 172          rv[synset.offset] = synset
 173
 174       return (rv, comments)
 175
 176    def dict_str(self):
 177       rv = self.gloss
 178       if (len(self.words) > 1):
 179          rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
 180       return rv
 181
 182    def __repr__(self):
 183       return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
 184
 185
 186 class WordnetDict:
 187    db_info_fmt = '''This file was converted from the original database on:
 188           %(conversion_datetime)s
 189
 190 The original data is available from:
 191      %(wn_url)s
 192
 193 The original data was distributed with the notice shown below. No
 194 additional restrictions are claimed.  Please redistribute this changed
 195 version under the same conditions and restriction that apply to the
 196 original version.\n\n
 197 %(wn_license)s'''
 198
 199    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
 200    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
 201
 202    def __init__(self, wn_url, desc_short, desc_long):
 203       self.word_data = {}
 204       self.wn_url = wn_url
 205       self.desc_short = desc_short
 206       self.desc_long = desc_long
 207       self.wn_license = None
 208
 209    def wn_dict_add(self, file_index, file_data):
 210       file_data.seek(0)
 211       file_index.seek(0)
 212       (synsets, license_lines) = Synset.build_from_file(file_data)
 213       WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
 214       if (license_lines):
 215          self.wn_license = '\n'.join(license_lines) + '\n'
 216
 217    @classmethod
 218    def base64_encode(cls, i):
 219       """Encode a non-negative integer into a dictd compatible base64 string"""
 220       if (i < 0):
 221          raise ValueError('Value %r for i is negative' % (i,))
 222       r = 63
 223       e = 1
 224       while (r < i):
 225          e += 1
 226          r = 64**e - 1
 227
 228       rv = ''
 229       while (e > 0):
 230          e -= 1
 231          d = math.floor(i / 64**e)
 232          rv += cls.base64_map[d]
 233          i = i % (64**e)
 234       return rv
 235
 236    @classmethod
 237    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
 238       """Write a single dict entry for <key> to index and data files"""
 239       entry_start = file_data.tell()
 240       file_data.write(entry)
 241       entry_len = len(entry)
 242       file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
 243             cls.base64_encode(entry_len), linesep))
 244
 245    def dict_generate(self, file_index, file_data):
 246       file_index.seek(0)
 247       file_data.seek(0)
 248       # The dictd file format is fairly iffy on the subject of special
 249       # headwords: either dictd is buggy, or the manpage doesn't tell the whole
 250       # story about the format.
 251       # The upshot is that order of these entries in the index *matters*.
 252       # Putting them at the beginning and in alphabetic order is afaict ok.
 253       # Some other orders completely and quietly break the ability to look
 254       # those headwords up.
 255       # -- problem encountered with 1.10.2, at 2007-08-05.
 256       file_data.write('\n')
 257       wn_url = self.wn_url
 258       conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
 259       wn_license = self.wn_license
 260       self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
 261       self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
 262       self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
 263       self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)
 264
 265
 266       words = list(self.word_data.keys())
 267       words.sort()
 268       for word in words:
 269          for wi in self.word_data[word]:
 270             word_cs = word
 271             # Use case-sensitivity information of first entry of first synset that
 272             # matches this word case-insensitively
 273             for synset in wi.synsets:
 274                for ss_word in synset.words:
 275                   if (ss_word.lower() == word_cs.lower()):
 276                      word_cs = ss_word
 277                      break
 278                else:
 279                   continue
 280                break
 281             else:
 282                continue
 283             break
 284
 285          outstr = ''
 286          for wi in self.word_data[word]:
 287             outstr += wi.dict_str() + '\n'
 288
 289          outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
 290          self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)
 291
 292       file_index.truncate()
 293       file_data.truncate()
 294
 295
 296 if (__name__ == '__main__'):
 297    import optparse
 298    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
 299    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
 300    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
 301    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
 302    op.add_option('--db_desc_short', dest='desc_short', default='     WordNet (r) 2.1 (2005)', help='short dict DB description')
 303    op.add_option('--db_desc_long', dest='desc_long', default='    WordNet (r): A Lexical Database for English from the\n     Cognitive Science Laboratory at Princeton University', help='long dict DB description')
 304
 305    (options, args) = op.parse_args()
 306
 307    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)
 308
 309    for i in range(0,len(args),2):
 310       print('Opening index file %r...' % args[i])
 311       file_index = open(args[i])
 312       print('Opening data file %r...' % args[i+1])
 313       file_data = open(args[i+1])
 314       print('Parsing index file and data file...')
 315       wnd.wn_dict_add(file_index, file_data)
 316
 317    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))
 318
 319    wnd.dict_generate(open(options.oi, 'w'),open(options.od, 'w'))
 320    print('All done.')