nearly everything good to go, mod minor hierarchy bug
[nltk_ontology_framework.git] / test / standalone / symbol_ontology / symbol_ontology_test.py
blob21630a4626e055683e5ff242f73442c7c3d58d6a
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 7, 2011
5 @author: mjacob
7 This is a sample usage of the ontology framework developed in this project,
8 which generates a concept tree from terms discovered in a corpus of texts
9 which describe various symbols, pulled from http://www.symbols.com/
11 As that site is under copyright, the corpus I've been using to run this test
12 is not distributed with this source code.
13 '''
14 import os
15 import yaml
16 from glob import glob
17 import nltk
18 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
19 from nltk.tokenize.regexp import RegexpTokenizer
20 from nltk.chunk.regexp import RegexpParser
21 from mjacob.ontologybuilder import (OntologyBuilderFramework
22 , HighlightedTerms
23 , SimpleWordnetSynonyms
24 , SimpleConcepts
25 , SimpleConceptHierarchies
26 , SimpleChunkFilter
27 , SimpleSubterms
28 , NavigliTermFilter
30 from util.cached import Cacheable
# Version of this generator; it is embedded in NAME below.
VERSION = 4
NAME = 'symbol ontology generator V.%s' % VERSION

# All data locations are resolved relative to the directory holding this file.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
# Glob pattern selecting every plain-text document of the corpus.
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
class SymbolOntologyBuilder(OntologyBuilderFramework,
                            HighlightedTerms,
                            SimpleWordnetSynonyms,
                            SimpleConcepts,
                            SimpleConceptHierarchies,
                            Cacheable,
                            dict):
    """
    SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
    a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
    and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.

    The object is also a C{dict}: the contents of the parameter file are loaded
    into it, and the components' settings are looked up with C{self[key]}.

    Note that all of the components REQUIRE that the base object be C{Cacheable}
    """

    # Keys looked up in the parameter file.
    OTHER_CORPORA = "terms.other_corpora"
    ALPHA = "terms.NavigliTermFilter.alpha"
    THRESHOLD = "terms.NavigliTermFilter.threshold"

    def __init__(self, parameter_file, only_do=None, ignore_cache=None):
        """
        @param parameter_file: a file containing parameters for the SymbolOntologyGenerator
        @param only_do: if specified, only perform steps in this collection
        @param ignore_cache: if specified, ignore any cached results from
            steps specified in this collection. note that any new results
            will still be saved to cache, possibly overwriting existing results.
        """
        # The parameters are loaded first because later initializers read
        # them through self[...].
        self.__set_parameters(parameter_file)
        self.__init_cacheable(ignore_cache=ignore_cache)
        self.__init_framework(only_do=only_do)
        self.__init_terms()
        self.__init_synonyms()
        self.__init_concepts()
        self.__init_concept_hierarchies()

    def __init_cacheable(self, ignore_cache):
        """
        Initialize the object as cacheable

        @param ignore_cache: passed through unchanged to C{Cacheable.__init__}
        """
        Cacheable.__init__(self,
                           os.path.join(CACHEDIR, NAME),
                           ignore_cache=ignore_cache,
                           debug=True)

    def __set_parameters(self, parameter_file):
        """
        Populate this object (as a C{dict}) from the YAML parameter file.

        @param parameter_file: path of the YAML file to read
        """
        with open(parameter_file) as stream:
            # safe_load instead of a bare yaml.load: newer PyYAML releases
            # refuse yaml.load() without an explicit Loader, and the plain
            # loader can construct arbitrary Python objects.
            # NOTE(review): assumes the parameter file holds only plain YAML
            # data (no python-specific tags) -- confirm against the real file.
            dict.__init__(self, yaml.safe_load(stream))

    def __init_framework(self, only_do=None):
        """
        initialize the framework

        @param only_do: passed through unchanged to
            C{OntologyBuilderFramework.__init__}
        """
        OntologyBuilderFramework.__init__(self,
                                          NAME,
                                          only_do=only_do)

    def __init_terms(self):
        r"""
        Initialize the term extraction process.

        This creates a regular and highlight corpus, identifies the other corpora
        specified in the parameters file, and creates basic tagger, chunker,
        chunk_filter, subterm, and term filter objects.

        For the regular corpus, the words are found using the regexp
        'img#\w+|#\d+|(?:\w\.){2,}|[\w\']+'
        which identifies image references, numbers, abbreviations,
        apostrophes, and word characters.

        For the highlight corpus, terms are found using the regexp
        '(?<=<)[^>]+(?=>)'
        which is basically anything between angle brackets.

        The tagger is a fairly sophisticated model that is part of C{nltk}

        The chunker assumes noun phrases consist of adjectives, nouns, and numbers.

        The term filter is a C{NavigliTermFilter} which is parameterized by the values
        in the parameter file.
        """
        # Both readers cover the same documents, so the directory is
        # globbed only once and the resulting list is shared.
        fileids = [os.path.basename(path) for path in glob(CORPUSFILES)]

        # The abbreviation alternative uses a non-capturing group and the
        # '{2,}' quantifier: '{2:}' is not a quantifier (it is matched as
        # literal text), and RegexpTokenizer patterns must not contain
        # capturing groups.
        word_pattern = r'img#\w+|#\d+|(?:\w\.){2,}|[\w\']+'
        highlight_pattern = r'(?<=<)[^>]+(?=>)'

        corpus = PlaintextCorpusReader(root=CORPUSDIR,
                                       fileids=fileids,
                                       word_tokenizer=RegexpTokenizer(word_pattern),
                                       encoding='utf-8')
        highlights = PlaintextCorpusReader(root=CORPUSDIR,
                                           fileids=fileids,
                                           word_tokenizer=RegexpTokenizer(highlight_pattern),
                                           encoding='utf-8')
        other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
        tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
        chunk_filter = SimpleChunkFilter('NP', minlength=2)
        subterms = SimpleSubterms(minlength=2)
        term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
                                        self[SymbolOntologyBuilder.THRESHOLD])

        HighlightedTerms.__init__(self,
                                  corpus,
                                  highlights,
                                  other_corpora,
                                  tagger,
                                  chunker,
                                  chunk_filter,
                                  subterms,
                                  term_filter,
                                  debug=True)

    def __init_synonyms(self):
        """
        Initialize the synonym component (C{SimpleWordnetSynonyms}).
        """
        SimpleWordnetSynonyms.__init__(self)

    def __init_concepts(self):
        """
        Initialize the concept component (C{SimpleConcepts}).
        """
        SimpleConcepts.__init__(self)

    def __init_concept_hierarchies(self):
        """
        Initialize the concept hierarchy component (C{SimpleConceptHierarchies}).
        """
        SimpleConceptHierarchies.__init__(self)
if __name__ == "__main__":
    # Steps to perform on this run; comment entries in or out to select them.
    steps_to_run = set([
        #OntologyBuilderFramework.TERMS,
        #OntologyBuilderFramework.SYNONYMS,
        OntologyBuilderFramework.CONCEPTS,
        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
    ])

    # Steps whose cached results are ignored (passed as ignore_cache).
    steps_to_recompute = set([
        #OntologyBuilderFramework.TERMS,
        #OntologyBuilderFramework.SYNONYMS,
        #OntologyBuilderFramework.CONCEPTS,
        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
    ])

    # NOTE: the parameter file name is relative, so it is resolved against
    # the current working directory rather than this file's directory.
    builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                    only_do=steps_to_run,
                                    ignore_cache=steps_to_recompute)
    state = builder.process()

    # this will display the concept hierarchies, but takes A LONG TIME
    #state[OntologyBuilderFramework.CONCEPT_HIERARCHIES].draw()