refactoring complete, but untested
[nltk_ontology_framework.git] / util / test_tokenize_corpus.py
# This Python file uses the following encoding: utf-8
'''
Created on May 3, 2011

@author: mjacob
'''
import os
import re
import yaml
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktWordTokenizer
from nltk.chunk.regexp import RegexpParser
from nltk.corpus import wordnet as wn
from itertools import chain
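
# Characters that are illegal in XML 1.0: control characters (other than tab,
# newline and carriage return), \ufffe/\uffff, and unpaired UTF-16 surrogates.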
XML_ILLEGAL = re.compile(u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' +
                         u'|' +
                         u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' %
                         (unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff)))
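
# Helper patterns: KILL_END_PUNCT trims trailing punctuation from a sentence,
# HIGHLIGHTS extracts spans wrapped in ⟦ ⟧ (manually marked terms in the corpus),
# PUNCT and SPACE are used to normalise candidate terms.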
KILL_END_PUNCT = re.compile(r'^(.*\w)\W*$')
HIGHLIGHTS = re.compile('⟦([^⟧]*)⟧')
PUNCT = re.compile(r"[\W']")
SPACE = re.compile(r'^\s*$')
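
# NP chunks are maximal runs of adjectives (JJ), cardinal numbers (CD) and
# noun tags (N.*); e.g. a sequence tagged CD JJ NN NNS chunks as a single NP.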
chunker_grammar = """NP: {<JJ|CD|N.*>+}"""

corpus_dir = '../symbol_corpus'
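
# Read every .txt file in the corpus as a (contents, filename) pair.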
files = [(open(os.path.join(corpus_dir, x)).read(), x)
         for x in os.listdir(corpus_dir) if x.endswith('.txt')]

longest_file = max(files, key=lambda x: len(x[0]))[0]
print len(files)
print len(longest_file)
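
# Train the Punkt sentence tokenizer on the longest document in the corpus.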
sentence_tokenizer = PunktSentenceTokenizer(longest_file)
word_tokenizer = PunktWordTokenizer()
chunker = RegexpParser(chunker_grammar)

def split_term(term):
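    """Yield every contiguous sub-phrase of a whitespace-separated term,
    including the full term itself, e.g. "data base schema" yields
    "data", "data base", "data base schema", "base", "base schema", "schema".
    """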
    tokens = filter(lambda x: not SPACE.match(x), term.split(' '))
    for i in range(len(tokens)):
        # let j reach len(tokens) so the full term (and single tokens) are yielded
        for j in range(i + 1, len(tokens) + 1):
            yield " ".join(tokens[i:j])

def unhighlight(sentence):
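    """Remove the ⟦ ⟧ markers from a sentence and return
    (plain sentence, list of highlighted spans)."""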
    return (sentence.replace('⟦', '').replace('⟧', ''),
            HIGHLIGHTS.findall(sentence))

statistics = {}
for file, filename in files:
    tokencounts = {}
    for sentence in sentence_tokenizer.tokenize(file):
        unhighlighted, highlights = unhighlight(sentence)
        unpunctuated = KILL_END_PUNCT.match(unhighlighted)
        if unpunctuated:
            unhighlighted = unpunctuated.group(1)
        else:
            print "empty sentence in %s: %s" % (filename, sentence)
            continue
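
        # Tokenize, POS-tag and chunk the cleaned sentence, collecting NP chunks.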
        tokens = [XML_ILLEGAL.sub('', word) for word in word_tokenizer.tokenize(unhighlighted)]
        tagged = nltk.pos_tag(tokens)
        chunked = chunker.parse(tagged)
        np_chunks = [" ".join(y[0] for y in x.leaves())
                     for x in chunked.subtrees(lambda x: x.node == 'NP')]
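
        # Count every sub-phrase of each NP chunk and of each highlighted term.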
        for chunk in chain(np_chunks, highlights):
            term = PUNCT.sub(' ', chunk).strip().lower()
            if not term:
                continue

            for subterm in split_term(term):
                if subterm in tokencounts:
                    tokencounts[subterm] += 1
                else:
                    tokencounts[subterm] = 1
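
    # (disabled) Optionally drop terms that already have a WordNet synset,
    # keeping only out-of-vocabulary candidates: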
    #for chunk in list(tokencounts.keys()):
    #    if wn.synsets(chunk):
    #        del tokencounts[chunk]
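
    # Merge this file's per-term counts into the global per-file count lists.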
    for term, count in tokencounts.items():
        if term in statistics:
            statistics[term].append(count)
        else:
            statistics[term] = [count]

# Remove singletons: drop any term that occurs exactly once in exactly one file.
for term, counts in statistics.items():
    if len(counts) == 1 and counts[0] == 1:
        del statistics[term]

yaml.dump(statistics, open(os.path.join(corpus_dir, 'statistics'), 'w'))
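
# A minimal sketch of how the dumped statistics could be read back for
# inspection (assumes PyYAML's yaml.load and the same corpus_dir as above):
#
#     stats = yaml.load(open(os.path.join(corpus_dir, 'statistics')))
#     for term, counts in sorted(stats.items(), key=lambda t: -sum(t[1]))[:20]:
#         print term, sum(counts), counts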