# Source: nltk_ontology_framework.git — test/standalone/symbol_ontology/symbol_ontology_test.py
# (blob 518c96b03ccae827eb4089675df084e456b913b2; commit note: "doc finished, just tweaking some stuff")
# This Python file uses the following encoding: utf-8
'''
Created on May 7, 2011

@author: mjacob

This is a sample usage of the ontology framework developed in this project,
which generates a concept tree from terms discovered in a corpus of texts
which describe various symbols, pulled from http://www.symbols.com/

As that site is under copyright, the corpus I've been using to run this test
is not distributed with this source code.
'''
import os
from glob import glob

import nltk
import yaml
from nltk.chunk.regexp import RegexpParser
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.tokenize.regexp import RegexpTokenizer

from mjacob.ontologybuilder import (OntologyBuilderFramework,
                                    HighlightedTerms,
                                    SimpleWordnetSynonyms,
                                    SimpleConcepts,
                                    SimpleConceptHierarchies,
                                    SimpleChunkFilter,
                                    SimpleSubterms,
                                    NavigliTermFilter)
from util.cached import Cacheable
# Version is baked into NAME, which keys the on-disk cache directory, so
# bumping VERSION invalidates all previously cached pipeline results.
VERSION = 4
NAME = 'symbol ontology generator V.%s' % VERSION

# All data lives alongside this test file: the corpus texts in
# symbol_corpus/*.txt and cached step results in symbol_corpus_cache/.
BASEDIR = os.path.dirname(os.path.abspath(__file__))
CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
class SymbolOntologyBuilder(OntologyBuilderFramework,
                            HighlightedTerms,
                            SimpleWordnetSynonyms,
                            SimpleConcepts,
                            SimpleConceptHierarchies,
                            Cacheable,
                            dict):
    """
    SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
    a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
    and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.

    The builder is also a C{dict}: the YAML parameter file is loaded directly
    into the instance, and configuration is looked up by key (see the class
    constants below for the keys this class reads).
    """

    # keys read from the YAML parameter file
    OTHER_CORPORA = "terms.other_corpora"
    ALPHA = "terms.NavigliTermFilter.alpha"
    THRESHOLD = "terms.NavigliTermFilter.threshold"

    def __init__(self, parameter_file, only_do=None, ignore_cache=None):
        """
        @param parameter_file: a file containing parameters for the SymbolOntologyBuilder
        @param only_do: if specified, only perform steps in this collection
        @param ignore_cache: if specified, ignore any cached results from
        steps specified in this collection.  note that any new results
        will still be saved to cache, possibly overwriting existing results.
        """
        self.__set_parameters(parameter_file)
        self.__init_cacheable(ignore_cache=ignore_cache)
        self.__init_framework(only_do=only_do)
        self.__init_terms()
        self.__init_synonyms()
        self.__init_concepts()
        self.__init_concept_hierarchies()

    def __init_cacheable(self, ignore_cache):
        """
        Initialize the object as cacheable.  The cache directory is keyed by
        C{NAME} (and hence C{VERSION}), so version bumps start a fresh cache.
        """
        Cacheable.__init__(self,
                           os.path.join(CACHEDIR, NAME),
                           ignore_cache=ignore_cache,
                           debug=True)

    def __set_parameters(self, parameter_file):
        """
        Populate this dict with the settings read from C{parameter_file} (YAML).
        """
        # safe_load: the config is plain data, and yaml.load without an
        # explicit Loader is deprecated and can construct arbitrary objects.
        with open(parameter_file) as f:
            dict.__init__(self, yaml.safe_load(f.read()))

    def __init_framework(self, only_do=None):
        """
        initialize the framework
        """
        OntologyBuilderFramework.__init__(self,
                                          NAME,
                                          only_do=only_do)

    def __init_terms(self):
        """
        Initialize the term extraction process.

        This creates a regular and a highlight corpus, identifies the other
        corpora specified in the parameters file, and creates basic tagger,
        chunker, chunk_filter, subterm, and term filter objects.

        For the regular corpus, the words are found using the regexp
        C{img#\\w+|#\\d+|(\\w\\.){2,}|[\\w']+}
        which identifies image references, numbers, abbreviations,
        apostrophes, and word characters.

        For the highlight corpus, terms are found using the regexp
        C{(?<=<)[^>]+(?=>)}
        which is basically anything between angle brackets.

        The tagger is a fairly sophisticated model that is part of C{nltk}.

        The chunker assumes noun phrases consist of adjectives, nouns, and numbers.

        The term filter is a C{NavigliTermFilter} which is parameterized by the
        values in the parameter file.
        """
        # NOTE(review): the original tokenizer regexp said "(\w\.){2:}" — the
        # re module treats "{2:}" as literal text, so the abbreviation branch
        # could never match.  "{2,}" (two or more letter-dot pairs) is the
        # evident intent and is what is used here.
        corpus = PlaintextCorpusReader(root=CORPUSDIR,
                                       fileids=[os.path.split(path)[1] for path in glob(CORPUSFILES)],
                                       word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|(\w\.){2,}|[\w\']+'),
                                       encoding='utf-8')
        highlights = PlaintextCorpusReader(root=CORPUSDIR,
                                           fileids=[os.path.split(path)[1] for path in glob(CORPUSFILES)],
                                           word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'),
                                           encoding='utf-8')
        other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
        tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
        chunk_filter = SimpleChunkFilter('NP', minlength=2)
        subterms = SimpleSubterms(minlength=2)
        term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
                                        self[SymbolOntologyBuilder.THRESHOLD])

        HighlightedTerms.__init__(self,
                                  corpus,
                                  highlights,
                                  other_corpora,
                                  tagger,
                                  chunker,
                                  chunk_filter,
                                  subterms,
                                  term_filter)

    def __init_synonyms(self):
        SimpleWordnetSynonyms.__init__(self)

    def __init_concepts(self):
        SimpleConcepts.__init__(self)

    def __init_concept_hierarchies(self):
        SimpleConceptHierarchies.__init__(self)
if __name__ == "__main__":
    # Run the full pipeline end to end.  Uncomment entries in ignore_cache
    # to force individual steps to be recomputed instead of loaded from disk.
    # (The closing parentheses here were lost in the scraped copy of this
    # file and have been restored.)
    builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
                                    only_do=set((
                                        OntologyBuilderFramework.TERMS,
                                        OntologyBuilderFramework.SYNONYMS,
                                        OntologyBuilderFramework.CONCEPTS,
                                        OntologyBuilderFramework.CONCEPT_HIERARCHIES,
                                    )),
                                    ignore_cache=set((
                                        #OntologyBuilderFramework.TERMS,
                                        #OntologyBuilderFramework.SYNONYMS,
                                        #OntologyBuilderFramework.CONCEPTS,
                                        #OntologyBuilderFramework.CONCEPT_HIERARCHIES,
                                    )))
    state = builder.process()
    #print("\n".join(sorted(state[OntologyBuilderFramework.TERMS])))