# This Python file uses the following encoding: utf-8
"""
This is a sample usage of the ontology framework developed in this project,
which generates a concept tree out of a plain-text symbol corpus.
"""
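# The framework appears to run up to four steps, in this order: term
# extraction, synonym discovery, concept formation, and concept-hierarchy
# induction (the step names TERMS, SYNONYMS, CONCEPTS and CONCEPT_HIERARCHIES
# are used in the __main__ block below).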
import os
from glob import glob

import nltk
import yaml
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize.regexp import RegexpTokenizer
from nltk.chunk.regexp import RegexpParser
from mjacob.ontologybuilder import (OntologyBuilderFramework,
                                    SimpleWordnetSynonyms,
                                    SimpleConceptHierarchies,
                                    # the names below are used in this module and are
                                    # assumed to come from the same package
                                    HighlightedTerms,
                                    SimpleChunkFilter,
                                    SimpleSubterms,
                                    NavigliTermFilter)
# BASEDIR is the directory containing this script; the corpus of *.txt files
# and the cache of intermediate results are expected to live next to it.
NAME = 'symbol ontology generator V.%s' % (VERSION)  # VERSION is expected to be defined elsewhere in the project
BASEDIR = os.path.split(os.path.abspath(__file__))[0]
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
class SymbolOntologyBuilder(OntologyBuilderFramework,
                            SimpleWordnetSynonyms,
                            SimpleConceptHierarchies,
                            ):
    # dotted keys looked up in the YAML parameter file
    OTHER_CORPORA = "terms.other_corpora"
    ALPHA = "terms.NavigliTermFilter.alpha"
    THRESHOLD = "terms.NavigliTermFilter.threshold"
    def __init__(self, parameter_file, only_do=None, ignore_cache=None):
        self.__set_parameters(parameter_file)
        self.__init_framework(only_do=only_do, ignore_cache=ignore_cache)
        self.__init_terms()
        self.__init_synonyms()
        self.__init_concepts()
        self.__init_concept_hierarchies()
    def __set_parameters(self, parameter_file):
        """Load the YAML parameter file and store its entries on this dict."""
        with open(parameter_file) as file:
            dict.__init__(self, yaml.load(file.read()))
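    # A minimal sketch of what the parameter file might contain, based on the
    # keys read in __init_terms below (the values here are illustrative only):
    #
    #   terms.other_corpora: []
    #   terms.NavigliTermFilter.alpha: 0.5
    #   terms.NavigliTermFilter.threshold: 1.5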
    def __init_framework(self, only_do=None, ignore_cache=None):
        OntologyBuilderFramework.__init__(self,
                                          only_do=only_do,
                                          ignore_cache=ignore_cache)
    def __init_terms(self):
        # term extraction: tokenize the corpus, POS-tag it, chunk noun
        # phrases, and filter the resulting candidate terms
        corpus = PlaintextCorpusReader(root=CORPUSDIR,
                                       fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
                                       # image ids, numeric ids, dotted abbreviations, and ordinary words
                                       word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|(\w\.){2,}|[\w\']+'))
        # the highlight reader tokenizes only the text between < and >
        highlights = PlaintextCorpusReader(root=CORPUSDIR,
                                           fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
                                           word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'))
        other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
        tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
        chunk_filter = SimpleChunkFilter('NP', minlength=2)
        subterms = SimpleSubterms(minlength=2)
        term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
                                        self[SymbolOntologyBuilder.THRESHOLD])

        # the argument list is reconstructed from the objects built above
        HighlightedTerms.__init__(self,
                                  corpus,
                                  highlights,
                                  other_corpora,
                                  tagger,
                                  chunker,
                                  chunk_filter,
                                  subterms,
                                  term_filter)
    def __init_synonyms(self):
        # presumably just initializes the corresponding mixin
        SimpleWordnetSynonyms.__init__(self)

    def __init_concepts(self):
        # body not shown in this excerpt
        pass

    def __init_concept_hierarchies(self):
        SimpleConceptHierarchies.__init__(self)
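
# A minimal driver: only_do presumably restricts which framework steps are
# executed, and ignore_cache presumably forces the listed steps to be
# recomputed rather than reloaded from the cache (both readings are inferred
# from the parameter names).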
if __name__ == "__main__":
    builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                    only_do=set((
                                        #OntologyBuilderFramework.TERMS,
                                        #OntologyBuilderFramework.SYNONYMS,
                                        OntologyBuilderFramework.CONCEPTS,
                                        OntologyBuilderFramework.CONCEPT_HIERARCHIES)),
                                    ignore_cache=set((
                                        #OntologyBuilderFramework.TERMS,
                                        #OntologyBuilderFramework.SYNONYMS,
                                        #OntologyBuilderFramework.CONCEPTS,
                                        OntologyBuilderFramework.CONCEPT_HIERARCHIES,)))
    state = builder.process()
    #print "\n".join(sorted(state[OntologyBuilderFramework.TERMS]))