# Source: nltk_ontology_framework.git — test/standalone/symbol_ontology/symbol_ontology_test.py
# (blob 518c96b03ccae827eb4089675df084e456b913b2; commit note: "doc finished, just tweaking some stuff")
# This Python file uses the following encoding: utf-8
'''
Created on May 7, 2011

@author: mjacob

This is a sample usage of the ontology framework developed in this project,
which generates a concept tree from terms discovered in a corpus of texts
which describe various symbols, pulled from http://www.symbols.com/

As that site is under copyright, the corpus I've been using to run this test
is not distributed with this source code.
'''
import os
from glob import glob

import nltk
import yaml
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize.regexp import RegexpTokenizer

from mjacob.ontologybuilder import (OntologyBuilderFramework,
                                    HighlightedTerms,
                                    SimpleWordnetSynonyms,
                                    SimpleConcepts,
                                    SimpleConceptHierarchies,
                                    SimpleChunkFilter,
                                    SimpleSubterms,
                                    NavigliTermFilter)
from util.cached import Cacheable
# Version is baked into NAME, which keys the on-disk cache directory, so
# bumping VERSION invalidates all previously cached pipeline results.
VERSION = 4
NAME = 'symbol ontology generator V.%s' % VERSION

# All data lives alongside this test file: the corpus texts in
# symbol_corpus/*.txt and cached step results in symbol_corpus_cache/.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
class SymbolOntologyBuilder(OntologyBuilderFramework,
                            HighlightedTerms,
                            SimpleWordnetSynonyms,
                            SimpleConcepts,
                            SimpleConceptHierarchies,
                            Cacheable,
                            dict):
    """
    SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
    a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
    and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.

    The builder is also a C{dict}: the YAML parameter file is loaded directly
    into the instance, and configuration is looked up by key (see the class
    constants below for the keys this class reads).
    """

    # keys read from the YAML parameter file
    OTHER_CORPORA = "terms.other_corpora"
    ALPHA = "terms.NavigliTermFilter.alpha"
    THRESHOLD = "terms.NavigliTermFilter.threshold"

    def __init__(self, parameter_file, only_do=None, ignore_cache=None):
        """
        @param parameter_file: a file containing parameters for the SymbolOntologyBuilder
        @param only_do: if specified, only perform steps in this collection
        @param ignore_cache: if specified, ignore any cached results from
        steps specified in this collection.  note that any new results
        will still be saved to cache, possibly overwriting existing results.
        """
        self.__set_parameters(parameter_file)
        self.__init_cacheable(ignore_cache=ignore_cache)
        self.__init_framework(only_do=only_do)
        self.__init_terms()
        self.__init_synonyms()
        self.__init_concepts()
        self.__init_concept_hierarchies()

    def __init_cacheable(self, ignore_cache):
        """
        Initialize the object as cacheable.  The cache directory is keyed by
        C{NAME} (and hence C{VERSION}), so version bumps start a fresh cache.
        """
        Cacheable.__init__(self,
                           os.path.join(CACHEDIR, NAME),
                           ignore_cache=ignore_cache,
                           debug=True)

    def __set_parameters(self, parameter_file):
        """
        Populate this dict with the settings read from C{parameter_file} (YAML).
        """
        # safe_load: the config is plain data, and yaml.load without an
        # explicit Loader is deprecated and can construct arbitrary objects.
        with open(parameter_file) as f:
            dict.__init__(self, yaml.safe_load(f.read()))

    def __init_framework(self, only_do=None):
        """
        initialize the framework
        """
        OntologyBuilderFramework.__init__(self,
                                          NAME,
                                          only_do=only_do)

    def __init_terms(self):
        """
        Initialize the term extraction process.

        This creates a regular and a highlight corpus, identifies the other
        corpora specified in the parameters file, and creates basic tagger,
        chunker, chunk_filter, subterm, and term filter objects.

        For the regular corpus, the words are found using the regexp
        C{img#\\w+|#\\d+|(\\w\\.){2,}|[\\w']+}
        which identifies image references, numbers, abbreviations,
        apostrophes, and word characters.

        For the highlight corpus, terms are found using the regexp
        C{(?<=<)[^>]+(?=>)}
        which is basically anything between angle brackets.

        The tagger is a fairly sophisticated model that is part of C{nltk}.

        The chunker assumes noun phrases consist of adjectives, nouns, and numbers.

        The term filter is a C{NavigliTermFilter} which is parameterized by the
        values in the parameter file.
        """
        # NOTE(review): the original tokenizer regexp said "(\w\.){2:}" — the
        # re module treats "{2:}" as literal text, so the abbreviation branch
        # could never match.  "{2,}" (two or more letter-dot pairs) is the
        # evident intent and is what is used here.
        corpus = PlaintextCorpusReader(root=CORPUSDIR,
                                       fileids=[os.path.split(path)[1] for path in glob(CORPUSFILES)],
                                       word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|(\w\.){2,}|[\w\']+'),
                                       encoding='utf-8')
        highlights = PlaintextCorpusReader(root=CORPUSDIR,
                                           fileids=[os.path.split(path)[1] for path in glob(CORPUSFILES)],
                                           word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'),
                                           encoding='utf-8')
        other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
        tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
        chunk_filter = SimpleChunkFilter('NP', minlength=2)
        subterms = SimpleSubterms(minlength=2)
        term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
                                        self[SymbolOntologyBuilder.THRESHOLD])

        HighlightedTerms.__init__(self,
                                  corpus,
                                  highlights,
                                  other_corpora,
                                  tagger,
                                  chunker,
                                  chunk_filter,
                                  subterms,
                                  term_filter)

    def __init_synonyms(self):
        SimpleWordnetSynonyms.__init__(self)

    def __init_concepts(self):
        SimpleConcepts.__init__(self)

    def __init_concept_hierarchies(self):
        SimpleConceptHierarchies.__init__(self)
if __name__ == "__main__":
    # Run the full pipeline end to end.  Uncomment entries in ignore_cache
    # to force individual steps to be recomputed instead of loaded from disk.
    # (The closing parentheses here were lost in the scraped copy of this
    # file and have been restored.)
    builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                    only_do=set((
                                        OntologyBuilderFramework.TERMS,
                                        OntologyBuilderFramework.SYNONYMS,
                                        OntologyBuilderFramework.CONCEPTS,
                                        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
                                    )),
                                    ignore_cache=set((
                                        #OntologyBuilderFramework.TERMS,
                                        #OntologyBuilderFramework.SYNONYMS,
                                        #OntologyBuilderFramework.CONCEPTS,
                                        #OntologyBuilderFramework.CONCEPT_HIERARCHIES,
                                    )))
    state = builder.process()
    #print("\n".join(sorted(state[OntologyBuilderFramework.TERMS])))