# This Python file uses the following encoding: utf-8

import os
import re
import codecs

import nltk
import yaml

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktWordTokenizer
from nltk.chunk.regexp import RegexpParser
from nltk.corpus import wordnet as wn
from itertools import chain
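
# Characters that are illegal in XML 1.0 output: control characters,
# non-characters, and unpaired halves of surrogate pairs (the repeated
# \ud800-\udfff ranges below).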
XML_ILLEGAL = re.compile(u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' +
                         u'|' +
                         u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' %
                         (unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff)))
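
# Sentence-cleaning patterns: KILL_END_PUNCT strips trailing punctuation,
# HIGHLIGHTS extracts hand-annotated ⟦...⟧ spans, PUNCT matches punctuation
# (including apostrophes), and SPACE matches whitespace-only tokens.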
KILL_END_PUNCT = re.compile(r'^(.*\w)\W*$', re.UNICODE)
HIGHLIGHTS = re.compile(u'⟦([^⟧]*)⟧')
PUNCT = re.compile(r"[\W']", re.UNICODE)
SPACE = re.compile(r'^\s*$')
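
# Chunk grammar: a noun phrase (NP) is one or more adjectives (JJ),
# cardinal numbers (CD), or nouns (N*).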
chunker_grammar = """NP: {<JJ|CD|N.*>+}"""

corpus_dir = '../symbol_corpus'
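
# Read every .txt file in the corpus directory as (contents, filename)
# pairs. Decoding as UTF-8 is an assumption about how the corpus is stored.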
files = [(codecs.open(os.path.join(corpus_dir, x), encoding='utf-8').read(), x)
         for x in os.listdir(corpus_dir) if x.endswith('.txt')]

longest_file = max(files, key=lambda x: len(x[0]))[0]

print len(longest_file)
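
# Train the unsupervised Punkt sentence tokenizer on the longest file,
# assuming the most text gives the best sentence-boundary model.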
sentence_tokenizer = PunktSentenceTokenizer(longest_file)
word_tokenizer = PunktWordTokenizer()
chunker = RegexpParser(chunker_grammar)
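

# Generate every contiguous run of tokens in a term, so multi-word terms
# also contribute counts for their component subphrases.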
def split_term(term):
    tokens = [t for t in term.split(' ') if not SPACE.match(t)]
    for i in range(len(tokens)):
        # len(tokens) + 1 so single tokens and the full term itself are
        # yielded as well (range(i + 1, len(tokens)) would skip both).
        for j in range(i + 1, len(tokens) + 1):
            yield " ".join(tokens[i:j])


def unhighlight(sentence):
    return (sentence.replace(u'⟦', u'').replace(u'⟧', u''),
            HIGHLIGHTS.findall(sentence))
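

# statistics maps each term to the list of its per-file occurrence counts.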
statistics = {}

for text, filename in files:
    tokencounts = {}

    for sentence in sentence_tokenizer.tokenize(text):
        unhighlighted, highlights = unhighlight(sentence)
        unpunctuated = KILL_END_PUNCT.match(unhighlighted)
        if unpunctuated:
            unhighlighted = unpunctuated.group(1)
        else:
            print "empty sentence in %s: %s" % (filename, sentence)
            continue
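
        # Tokenize the cleaned sentence, tag parts of speech, and extract
        # NP chunks (Tree.node is the pre-3.0 NLTK API; 3.x uses .label()).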
        tokens = [XML_ILLEGAL.sub(u'', word)
                  for word in word_tokenizer.tokenize(unhighlighted)]
        tagged = nltk.pos_tag(tokens)
        chunked = chunker.parse(tagged)
        np_chunks = [" ".join(leaf[0] for leaf in subtree.leaves())
                     for subtree in chunked.subtrees(lambda t: t.node == 'NP')]
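
        # Count each NP chunk and each hand-marked highlight, lower-cased,
        # punctuation-stripped, together with all of its subterms.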
        for chunk in chain(np_chunks, highlights):
            term = PUNCT.sub(u' ', chunk).strip().lower()
            for subterm in split_term(term):
                if subterm in tokencounts:
                    tokencounts[subterm] += 1
                else:
                    tokencounts[subterm] = 1
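
    # Disabled experiment: drop terms that WordNet already knows,
    # keeping only domain-specific vocabulary.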
    #for chunk in list(tokencounts.keys()):
    #    if wn.synsets(chunk):
    #        del tokencounts[chunk]
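
    # Fold this file's counts into the cross-file statistics.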
    for term, count in tokencounts.items():
        if term in statistics:
            statistics[term].append(count)
        else:
            statistics[term] = [count]
88 """ remove singletons """
for term, count in statistics.items():
    if len(count) == 1 and count[0] == 1:
        # .items() returns a list in Python 2, so deleting here is safe.
        del statistics[term]
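
# Persist the aggregated counts as YAML alongside the corpus.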
yaml.dump(statistics, open(os.path.join(corpus_dir, 'statistics'), 'w'))