# This Python file uses the following encoding: utf-8
"""
This is a sample usage of the ontology framework developed in this project,
which generates a concept tree from terms discovered in a corpus of texts
which describe various symbols, pulled from http://www.symbols.com/

As that site is under copyright, the corpus I've been using to run this test
is not distributed with this source code.
"""
# stdlib
import os
from glob import glob

# third-party
import nltk.data
import yaml
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize.regexp import RegexpTokenizer

# project-local
# NOTE(review): several names below were imported on lines lost from this
# extraction (the class body uses HighlightedTerms, SimpleConcepts,
# SimpleChunkFilter, SimpleSubterms and NavigliTermFilter) — confirm the
# exact module paths against the original file.
from mjacob.ontologybuilder import (OntologyBuilderFramework,
                                    HighlightedTerms,
                                    SimpleWordnetSynonyms,
                                    SimpleConcepts,
                                    SimpleConceptHierarchies,
                                    SimpleChunkFilter,
                                    SimpleSubterms,
                                    NavigliTermFilter,
                                    )
from util.cached import Cacheable
# NOTE(review): VERSION (used by NAME below) was imported on an elided line;
# restore its import from the original file.
# Human-readable name of this generator; doubles as the cache subdirectory
# name so caches from different versions never collide.
NAME = 'symbol ontology generator V.%s' % VERSION

# All paths are resolved relative to the directory containing this file.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
# Glob pattern matching every corpus document.
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
39 class SymbolOntologyBuilder(OntologyBuilderFramework
,
41 SimpleWordnetSynonyms
,
43 SimpleConceptHierarchies
,
    """
    SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
    a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
    and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.
    """
# Keys used to look up values in the YAML parameter mapping (self[key]).
OTHER_CORPORA = "terms.other_corpora"          # extra corpora for term discovery
ALPHA = "terms.NavigliTermFilter.alpha"        # NavigliTermFilter weight
THRESHOLD = "terms.NavigliTermFilter.threshold"  # NavigliTermFilter cutoff
def __init__(self, parameter_file, only_do=None, ignore_cache=None):
    """
    Configure the builder and run each initialization step in order.

    @param parameter_file: a file containing parameters for the SymbolOntologyGenerator
    @param only_do: if specified, only perform steps in this collection
    @param ignore_cache: if specified, ignore any cached results from
        steps specified in this collection. note that any new results
        will still be saved to cache, possibly overwriting existing results.
    """
    # Order matters: parameters must be loaded before the cache (which is
    # keyed under CACHEDIR/NAME) and the framework are initialized.
    self.__set_parameters(parameter_file)
    self.__init_cacheable(ignore_cache=ignore_cache)
    self.__init_framework(only_do=only_do)
    # NOTE(review): the extraction elides one line here (original line 67);
    # it most likely called self.__init_terms(), which is defined below but
    # never invoked in the visible code — confirm against the original file.
    self.__init_synonyms()
    self.__init_concepts()
    self.__init_concept_hierarchies()
def __init_cacheable(self, ignore_cache):
    """
    Initialize the object as cacheable.

    Step results are cached under CACHEDIR in a directory named after this
    generator (NAME includes the version, so caches never cross versions).

    @param ignore_cache: collection of step names whose cached results
        should be ignored; forwarded unchanged to C{Cacheable.__init__}.
    """
    # NOTE(review): the extraction truncates this call (original lines
    # 79-80 are missing); any further arguments to Cacheable.__init__
    # must be restored from the original file.
    Cacheable.__init__(self,
                       os.path.join(CACHEDIR, NAME),
                       ignore_cache=ignore_cache,
                       )
def __set_parameters(self, parameter_file):
    """
    Load builder parameters from the YAML file at *parameter_file*.

    The parsed mapping is installed directly as this object's dict
    contents (the builder appears to subclass dict via the framework —
    confirm against the class bases in the original file).

    @param parameter_file: path to the YAML parameters file.
    """
    # 'with' guarantees the handle is closed even if parsing raises;
    # 'stream' avoids shadowing the builtin 'file'.
    with open(parameter_file) as stream:
        # SECURITY NOTE: yaml.load can construct arbitrary Python objects;
        # prefer yaml.safe_load unless the parameter file is fully trusted.
        dict.__init__(self, yaml.load(stream.read()))
def __init_framework(self, only_do=None):
    """
    Initialize the ontology-builder framework.

    @param only_do: if specified, only perform steps in this collection;
        forwarded to C{OntologyBuilderFramework.__init__}.
    """
    # NOTE(review): the extraction truncates this call (original lines
    # 93-95 are missing); presumably it passed only_do=only_do and possibly
    # more — restore the exact arguments from the original file.
    OntologyBuilderFramework.__init__(self,
                                      only_do=only_do,
                                      )
def __init_terms(self):
    """
    Initialize the term extraction process.

    This creates a regular and highlight corpus, identifies the other corpora
    specified in the parameters file, and creates basic tagger, chunker,
    chunk_filter, subterm, and term filter objects.

    For the regular corpus, the words are found using the regexp
    'img#\w+|#\d+|(\w\.){2:}|[\w\']+'
    which identifies image references, numbers, abbreviations,
    apostrophes, and word characters.

    For the highlight corpus, terms are found using the regexp
    which is basically anything between angle brackets.

    The tagger is a fairly sophisticated model that is part of C{nltk}

    The chunker assumes noun phrases consist of adjectives, nouns, and numbers.

    The term filter is a C{NavigliTermFilter} which is parameterized by the values
    in the parameter file.
    """
    # Main corpus: every *.txt file under CORPUSDIR.
    corpus = PlaintextCorpusReader(root=CORPUSDIR,
                                   fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
                                   word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|(\w\.){2:}|[\w\']+'),
    # NOTE(review): original line 124 is elided here — the remaining
    # argument(s) and closing parenthesis of this call are lost; restore
    # them from the original file.
    # Highlight corpus: same files, but tokens are the spans between
    # angle brackets (lookbehind '<' ... lookahead '>').
    highlights = PlaintextCorpusReader(root=CORPUSDIR,
                                       fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
                                       word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'),
    # NOTE(review): original line 128 is elided here — closing of the call
    # above is lost; restore it from the original file.
    # Auxiliary corpora named in the parameters file.
    other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
    tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
    # NP = one or more adjectives, cardinal numbers, or nouns.
    chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
    chunk_filter = SimpleChunkFilter('NP', minlength=2)
    subterms = SimpleSubterms(minlength=2)
    term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
                                    self[SymbolOntologyBuilder.THRESHOLD])
    HighlightedTerms.__init__(self,
    # NOTE(review): original lines 138-146 are elided — presumably the nine
    # arguments (corpus, highlights, other_corpora, tagger, chunker,
    # chunk_filter, subterms, term_filter) and the closing parenthesis;
    # restore them from the original file.
def __init_synonyms(self):
    """Set up the synonym step by delegating to C{SimpleWordnetSynonyms}."""
    SimpleWordnetSynonyms.__init__(self)
def __init_concepts(self):
    """Set up the concept step by delegating to C{SimpleConcepts}."""
    SimpleConcepts.__init__(self)
def __init_concept_hierarchies(self):
    """Set up the hierarchy step by delegating to C{SimpleConceptHierarchies}."""
    SimpleConceptHierarchies.__init__(self)
156 if __name__ == "__main__":
157 builder = SymbolOntologyBuilder('symbol_ontology_builder
.yaml
',
159 OntologyBuilderFramework.TERMS,
160 OntologyBuilderFramework.SYNONYMS,
161 OntologyBuilderFramework.CONCEPTS,
162 OntologyBuilderFramework.CONCEPT_HIERARCHIES
165 #OntologyBuilderFramework.TERMS,
166 #OntologyBuilderFramework.SYNONYMS,
167 #OntologyBuilderFramework.CONCEPTS,
168 #OntologyBuilderFramework.CONCEPT_HIERARCHIES,
170 state = builder.process()
171 #print "\n".join(sorted(state[OntologyBuilderFramework.TERMS]))