1 # This Python file uses the following encoding: utf-8
7 This is a sample usage of the ontology framework developed in this project,
8 which generates a concept tree from terms discovered in a corpus of texts
9 which describe various symbols, pulled from http://www.symbols.com/
11 As that site is under copyright, the corpus I've been using to run this test
12 is not distributed with this source code.
18 from nltk
.corpus
.reader
.plaintext
import PlaintextCorpusReader
19 from nltk
.tokenize
.regexp
import RegexpTokenizer
20 from nltk
.chunk
.regexp
import RegexpParser
21 from mjacob
.ontologybuilder
import (OntologyBuilderFramework
23 , SimpleWordnetSynonyms
25 , SimpleConceptHierarchies
30 from util
.cached
import Cacheable
# Identity string for this generator; VERSION is defined elsewhere in the module.
NAME = 'symbol ontology generator V.%s' % (VERSION)

# All paths are anchored at the directory containing this file, so the
# script works regardless of the current working directory.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')   # cached intermediate results
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')        # the (undistributed) corpus
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')            # glob pattern for corpus texts
39 class SymbolOntologyBuilder(OntologyBuilderFramework
,
41 SimpleWordnetSynonyms
,
43 SimpleConceptHierarchies
,
47 SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
48 a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
49 and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.
51 Note that all of the components REQUIRE that the base object be C{Cacheable}
54 OTHER_CORPORA
= "terms.other_corpora"
55 ALPHA
= "terms.NavigliTermFilter.alpha"
56 THRESHOLD
= "terms.NavigliTermFilter.threshold"
def __init__(self, parameter_file, only_do=None, ignore_cache=None):
    """
    Build a SymbolOntologyBuilder by initializing each mixin component
    in a fixed order: parameters first, then caching, then the framework,
    then the synonym/concept/hierarchy components.

    @param parameter_file: a file containing parameters for the
        SymbolOntologyGenerator (loaded into self by __set_parameters)
    @param only_do: if specified, only perform steps in this collection
    @param ignore_cache: if specified, ignore any cached results from
        steps specified in this collection. note that any new results
        will still be saved to cache, possibly overwriting existing results.
    """
    # Parameters must be loaded first: later initializers read self[...].
    self.__set_parameters(parameter_file)
    self.__init_cacheable(ignore_cache=ignore_cache)
    self.__init_framework(only_do=only_do)

    # NOTE(review): __init_terms is defined on this class but is never
    # invoked here — confirm whether term extraction is triggered by the
    # framework itself or whether a self.__init_terms() call was dropped.
    self.__init_synonyms()
    self.__init_concepts()
    self.__init_concept_hierarchies()
74 def __init_cacheable(self
, ignore_cache
):
76 Initialize the object as cacheable
78 Cacheable
.__init
__(self
,
79 os
.path
.join(CACHEDIR
, NAME
),
80 ignore_cache
=ignore_cache
,
def __set_parameters(self, parameter_file):
    """
    Load generator parameters from the given YAML file into self.

    ``dict.__init__`` is used directly, so this class is expected to
    inherit from ``dict``; the parsed YAML mapping becomes the dictionary
    contents and parameters are read back elsewhere via ``self[key]``.

    @param parameter_file: path to a YAML file of configuration parameters
    """
    # 'fh' rather than 'file' — avoid shadowing the builtin.
    with open(parameter_file) as fh:
        # safe_load instead of bare yaml.load: plain yaml.load can
        # construct arbitrary Python objects from the file and requires
        # an explicit Loader on modern PyYAML. An empty parameter file
        # parses to None, so fall back to an empty mapping.
        dict.__init__(self, yaml.safe_load(fh) or {})
90 def __init_framework(self
, only_do
=None):
92 initialize the framework
94 OntologyBuilderFramework
.__init
__(self
,
98 def __init_terms(self
):
100 Initialize the term extraction process.
102 This creates a regular and highlight corpus, identifies the other corpora
103 specified in the parameters file, and creates basic tagger, chunker,
104 chunk_filter, subterm, and term filter objects.
106 For the regular corpus, the words are found using the regexp
107 'img#\w+|#\d+|(\w\.){2:}|[\w\']+'
108 which identifies image references, numbers, abbreviations,
109 apostrophes, and word characters.
111 For the highlight corpus, terms are found using the regexp
113 which is basically anything between angle brackets.
115 The tagger is a fairly sophisticated model that is part of C{nltk}
117 The chunker assumes noun phrases consist of adjectives, nouns, and numbers.
119 The term filter is a C{NavigliTermFilter} which is parameterized by the values
120 in the parameter file.
123 corpus
= PlaintextCorpusReader(root
=CORPUSDIR
,
124 fileids
=[os
.path
.split(file)[1] for file in glob(CORPUSFILES
)],
125 word_tokenizer
=RegexpTokenizer(r
'img#\w+|#\d+|(\w\.){2:}|[\w\']+'),
127 highlights = PlaintextCorpusReader(root=CORPUSDIR,
128 fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
129 word_tokenizer=RegexpTokenizer(r'(?
<=<)[^
>]+(?
=>)'),
131 other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
132 tagger = nltk.data.load('taggers
/maxent_treebank_pos_tagger
/english
.pickle
')
133 chunker = RegexpParser('NP
: {<JJ|CD|N
.*>+}')
134 chunk_filter = SimpleChunkFilter('NP
', minlength=2)
135 subterms = SimpleSubterms(minlength=2)
136 term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
137 self[SymbolOntologyBuilder.THRESHOLD])
139 HighlightedTerms.__init__(self,
def __init_synonyms(self):
    """Initialize the synonym-discovery component by delegating to the
    SimpleWordnetSynonyms mixin's initializer."""
    SimpleWordnetSynonyms.__init__(self)
def __init_concepts(self):
    """Initialize the concept-formation component by delegating to the
    SimpleConcepts mixin's initializer."""
    SimpleConcepts.__init__(self)
def __init_concept_hierarchies(self):
    """Initialize the concept-hierarchy component by delegating to the
    SimpleConceptHierarchies mixin's initializer."""
    SimpleConceptHierarchies.__init__(self)
159 if __name__ == "__main__":
160 builder = SymbolOntologyBuilder('symbol_ontology_builder
.yaml
',
162 #OntologyBuilderFramework.TERMS,
163 #OntologyBuilderFramework.SYNONYMS,
164 OntologyBuilderFramework.CONCEPTS,
165 OntologyBuilderFramework.CONCEPT_HIERARCHIES
168 #OntologyBuilderFramework.TERMS,
169 #OntologyBuilderFramework.SYNONYMS,
170 #OntologyBuilderFramework.CONCEPTS,
171 OntologyBuilderFramework.CONCEPT_HIERARCHIES,
173 state = builder.process()
175 # this will display the concept hierarchies, but takes A LONG TIME
176 #state[OntologyBuilderFramework.CONCEPT_HIERARCHIES].draw()