# This Python file uses the following encoding: utf-8

import os
import re
import codecs

import nltk
import yaml

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktWordTokenizer
from nltk.chunk.regexp import RegexpParser
from nltk.corpus import wordnet as wn
from itertools import chain
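
# Characters that are illegal in XML 1.0 output: control characters,
# non-characters, and unpaired halves of surrogate pairs (the repeated
# \ud800-\udfff ranges below).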
XML_ILLEGAL = re.compile(u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' +
                         u'|' +
                         u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' %
                         (unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff),
                          unichr(0xd800), unichr(0xdbff), unichr(0xdc00), unichr(0xdfff)))
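
# Sentence-cleaning patterns: KILL_END_PUNCT strips trailing punctuation,
# HIGHLIGHTS extracts hand-annotated ⟦...⟧ spans, PUNCT matches punctuation
# (including apostrophes), and SPACE matches whitespace-only tokens.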
KILL_END_PUNCT = re.compile(r'^(.*\w)\W*$', re.UNICODE)
HIGHLIGHTS = re.compile(u'⟦([^⟧]*)⟧')
PUNCT = re.compile(r"[\W']", re.UNICODE)
SPACE = re.compile(r'^\s*$')
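
# Chunk grammar: a noun phrase (NP) is one or more adjectives (JJ),
# cardinal numbers (CD), or nouns (N*).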
chunker_grammar = """NP: {<JJ|CD|N.*>+}"""

corpus_dir = '../symbol_corpus'
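
# Read every .txt file in the corpus directory as (contents, filename)
# pairs. Decoding as UTF-8 is an assumption about how the corpus is stored.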
files = [(codecs.open(os.path.join(corpus_dir, x), encoding='utf-8').read(), x)
         for x in os.listdir(corpus_dir) if x.endswith('.txt')]

longest_file = max(files, key=lambda x: len(x[0]))[0]

print len(longest_file)
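
# Train the unsupervised Punkt sentence tokenizer on the longest file,
# assuming the most text gives the best sentence-boundary model.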
sentence_tokenizer = PunktSentenceTokenizer(longest_file)
word_tokenizer = PunktWordTokenizer()
chunker = RegexpParser(chunker_grammar)
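

# Generate every contiguous run of tokens in a term, so multi-word terms
# also contribute counts for their component subphrases.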
def split_term(term):
    tokens = [t for t in term.split(' ') if not SPACE.match(t)]
    for i in range(len(tokens)):
        # len(tokens) + 1 so single tokens and the full term itself are
        # yielded as well (range(i + 1, len(tokens)) would skip both).
        for j in range(i + 1, len(tokens) + 1):
            yield " ".join(tokens[i:j])


def unhighlight(sentence):
    return (sentence.replace(u'⟦', u'').replace(u'⟧', u''),
            HIGHLIGHTS.findall(sentence))
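

# statistics maps each term to the list of its per-file occurrence counts.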
statistics = {}

for text, filename in files:
    tokencounts = {}

    for sentence in sentence_tokenizer.tokenize(text):
        unhighlighted, highlights = unhighlight(sentence)
        unpunctuated = KILL_END_PUNCT.match(unhighlighted)
        if unpunctuated:
            unhighlighted = unpunctuated.group(1)
        else:
            print "empty sentence in %s: %s" % (filename, sentence)
            continue
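
        # Tokenize the cleaned sentence, tag parts of speech, and extract
        # NP chunks (Tree.node is the pre-3.0 NLTK API; 3.x uses .label()).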
        tokens = [XML_ILLEGAL.sub(u'', word)
                  for word in word_tokenizer.tokenize(unhighlighted)]
        tagged = nltk.pos_tag(tokens)
        chunked = chunker.parse(tagged)
        np_chunks = [" ".join(leaf[0] for leaf in subtree.leaves())
                     for subtree in chunked.subtrees(lambda t: t.node == 'NP')]
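
        # Count each NP chunk and each hand-marked highlight, lower-cased,
        # punctuation-stripped, together with all of its subterms.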
        for chunk in chain(np_chunks, highlights):
            term = PUNCT.sub(u' ', chunk).strip().lower()
            for subterm in split_term(term):
                if subterm in tokencounts:
                    tokencounts[subterm] += 1
                else:
                    tokencounts[subterm] = 1
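
    # Disabled experiment: drop terms that WordNet already knows,
    # keeping only domain-specific vocabulary.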
    #for chunk in list(tokencounts.keys()):
    #    if wn.synsets(chunk):
    #        del tokencounts[chunk]
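
    # Fold this file's counts into the cross-file statistics.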
    for term, count in tokencounts.items():
        if term in statistics:
            statistics[term].append(count)
        else:
            statistics[term] = [count]
88 """ remove singletons """
for term, count in statistics.items():
    if len(count) == 1 and count[0] == 1:
        # .items() returns a list in Python 2, so deleting here is safe.
        del statistics[term]
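
# Persist the aggregated counts as YAML alongside the corpus.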
yaml.dump(statistics, open(os.path.join(corpus_dir, 'statistics'), 'w'))