comments, some memoization, some other minor code changes
[nltk_ontology_framework.git] / test / standalone / symbol_ontology / symbol_ontology_test.py
blobaf2a5dba86f6910bf031b7f21159ecdedff6fa1b
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 7, 2011
5 @author: mjacob
7 This is a sample usage of the ontology framework developed in this project,
8 which generates a concept tree out of
13 '''
14 import os
15 import yaml
16 from glob import glob
17 import nltk
18 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
19 from nltk.tokenize.regexp import RegexpTokenizer
20 from nltk.chunk.regexp import RegexpParser
21 from mjacob.ontologybuilder import (OntologyBuilderFramework
22 , HighlightedTerms
23 , SimpleWordnetSynonyms
24 , SimpleConcepts
25 , SimpleConceptHierarchies
26 , SimpleChunkFilter
27 , SimpleSubterms
28 , NavigliTermFilter
# Version of this generator; it is embedded in NAME below.
VERSION = 4
NAME = 'symbol ontology generator V.%s' % VERSION

# Directory containing this script; the remaining paths are built from it.
BASEDIR = os.path.dirname(os.path.abspath(__file__))

# Directory handed to OntologyBuilderFramework together with NAME.
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')

# Root directory of the plaintext corpus and the glob pattern for its files.
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
class SymbolOntologyBuilder(OntologyBuilderFramework,
                            HighlightedTerms,
                            SimpleWordnetSynonyms,
                            SimpleConcepts,
                            SimpleConceptHierarchies,
                            dict):
    '''
    Ontology builder for the symbol corpus.

    Combines OntologyBuilderFramework with the term, synonym, concept and
    concept hierarchy base classes imported from mjacob.ontologybuilder.
    The instance is also a dict: it holds the settings read from the YAML
    parameter file, which are looked up with the keys defined below.
    '''

    # Keys looked up in the loaded parameter mapping.
    OTHER_CORPORA = "terms.other_corpora"
    ALPHA = "terms.NavigliTermFilter.alpha"
    THRESHOLD = "terms.NavigliTermFilter.threshold"

    def __init__(self, parameter_file, only_do=None, ignore_cache=None):
        '''
        Load the parameters, then initialize the framework and each base
        class in turn.

        @param parameter_file: path of the YAML file holding the settings
        @param only_do: forwarded unchanged to OntologyBuilderFramework
        @param ignore_cache: forwarded unchanged to OntologyBuilderFramework
        '''
        # The parameters have to be loaded first: __init_terms reads them.
        self.__set_parameters(parameter_file)
        self.__init_framework(only_do=only_do, ignore_cache=ignore_cache)
        self.__init_terms()
        self.__init_synonyms()
        self.__init_concepts()
        self.__init_concept_hierarchies()

    def __set_parameters(self, parameter_file):
        '''
        Fill this dict with the mapping stored in the YAML parameter file.

        @param parameter_file: path of the YAML file to read
        '''
        with open(parameter_file) as stream:
            # safe_load only constructs plain YAML types. A bare yaml.load
            # without an explicit Loader can construct arbitrary objects and
            # is rejected outright by PyYAML 6 and later.
            parameters = yaml.safe_load(stream)
        # An empty file parses to None, which dict.__init__ cannot accept.
        dict.__init__(self, parameters or {})

    def __init_framework(self, only_do=None, ignore_cache=None):
        '''
        Initialize the OntologyBuilderFramework part with the module-level
        NAME and CACHEDIR, passing only_do and ignore_cache through.
        '''
        OntologyBuilderFramework.__init__(self,
                                          NAME,
                                          CACHEDIR,
                                          only_do=only_do,
                                          ignore_cache=ignore_cache)

    def __init_terms(self):
        '''
        Build the corpus readers and the tagging/chunking/filtering helpers,
        then hand them to HighlightedTerms.

        Raises KeyError if one of the OTHER_CORPORA, ALPHA or THRESHOLD keys
        is missing from the loaded parameters.
        '''
        # Both readers cover the same files, so list them only once.
        fileids = [os.path.basename(path) for path in glob(CORPUSFILES)]

        # Tokens are, in order of preference: "img#<word>", "#<digits>",
        # dotted abbreviations made of two or more "<char>." pairs, and runs
        # of word characters and apostrophes.  The abbreviation group is
        # non-capturing (RegexpTokenizer patterns must not capture) and uses
        # the "{2,}" quantifier; the former "{2:}" is not a quantifier and
        # was matched literally.
        corpus = PlaintextCorpusReader(
            root=CORPUSDIR,
            fileids=fileids,
            word_tokenizer=RegexpTokenizer(
                r'img#\w+|#\d+|(?:\w\.){2,}|[\w\']+'),
            encoding='utf-8')

        # The same files, but the only tokens are the spans enclosed in
        # "<" and ">" (the delimiters themselves are excluded by the
        # lookbehind/lookahead).
        highlights = PlaintextCorpusReader(
            root=CORPUSDIR,
            fileids=fileids,
            word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'),
            encoding='utf-8')

        other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
        tagger = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')
        # An NP chunk is one or more consecutive tags matching JJ, CD or N.*
        chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
        chunk_filter = SimpleChunkFilter('NP', minlength=2)
        subterms = SimpleSubterms(minlength=2)
        term_filter = NavigliTermFilter(
            self[SymbolOntologyBuilder.ALPHA],
            self[SymbolOntologyBuilder.THRESHOLD])

        HighlightedTerms.__init__(self,
                                  corpus,
                                  highlights,
                                  other_corpora,
                                  tagger,
                                  chunker,
                                  chunk_filter,
                                  subterms,
                                  term_filter)

    def __init_synonyms(self):
        '''Placeholder: no synonym-specific setup is performed.'''
        pass

    def __init_concepts(self):
        '''Placeholder: no concept-specific setup is performed.'''
        pass

    def __init_concept_hierarchies(self):
        '''Placeholder: no hierarchy-specific setup is performed.'''
        pass
if __name__ == "__main__":
    # Steps passed as only_do; commented-out entries are left out of the set.
    # NOTE(review): presumably this limits which steps run -- confirm
    # against OntologyBuilderFramework.
    selected_steps = set([
        #OntologyBuilderFramework.TERMS,
        #OntologyBuilderFramework.SYNONYMS,
        OntologyBuilderFramework.CONCEPTS,
        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
    ])

    # Steps passed as ignore_cache; commented-out entries are left out.
    # NOTE(review): presumably these steps bypass cached results -- confirm
    # against OntologyBuilderFramework.
    uncached_steps = set([
        #OntologyBuilderFramework.TERMS,
        #OntologyBuilderFramework.SYNONYMS,
        #OntologyBuilderFramework.CONCEPTS,
        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
    ])

    # The parameter file name is relative to the current working directory.
    builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                    only_do=selected_steps,
                                    ignore_cache=uncached_steps)
    state = builder.process()
    #print "\n".join(sorted(state[OntologyBuilderFramework.TERMS]))