test/standalone/symbol_ontology/symbol_ontology_test.py

   1 # This Python file uses the following encoding: utf-8
   2 '''
   3 Created on May 7, 2011
   4
   5 @author: mjacob
   6
   7 This is a sample usage of the ontology framework developed in this project,
   8 which generates a concept tree from terms discovered in a corpus of texts
   9 which describe various symbols, pulled from http://www.symbols.com/
  10
  11 As that site is under copyright, the corpus I've been using to run this test
  12 is not distributed with this source code.
  13 '''
  14 import os
  15 import yaml
  16 from glob import glob
  17 import nltk
  18 from nltk.corpus.reader.plaintext import PlaintextCorpusReader
  19 from nltk.tokenize.regexp import RegexpTokenizer
  20 from nltk.chunk.regexp import RegexpParser
  21 from mjacob.ontologybuilder import (OntologyBuilderFramework
  22                                   , HighlightedTerms
  23                                   , SimpleWordnetSynonyms
  24                                   , SimpleConcepts
  25                                   , SimpleConceptHierarchies
  26                                   , SimpleChunkFilter
  27                                   , SimpleSubterms
  28                                   , NavigliTermFilter
  29                                     )
  30 from util.cached import Cacheable
  31
  32 VERSION = 4
  33 NAME = 'symbol ontology generator V.%s' % (VERSION)
  34 BASEDIR = os.path.split(os.path.abspath(__file__))[0]
  35 CACHEDIR = os.path.join(BASEDIR, 'symbol_corpus_cache')
  36 CORPUSDIR = os.path.join(BASEDIR, 'symbol_corpus')
  37 CORPUSFILES = os.path.join(CORPUSDIR, '*.txt')
  38
  39 class SymbolOntologyBuilder(OntologyBuilderFramework,
  40                             HighlightedTerms,
  41                             SimpleWordnetSynonyms,
  42                             SimpleConcepts,
  43                             SimpleConceptHierarchies,
  44                             Cacheable,
  45                             dict):
  46     """
  47     SymbolOntologyBuilder is implemented on top of the C{OntologyBuilderFramework}, as
  48     a combination of C{HighlightedTerms}, C{SimpleWordnetSynonyms}, C{SimpleConcepts},
  49     and C{SimpleConceptHierarchies}, as well as being C{Cacheable}.
  50
  51     Note that all of the components REQUIRE that the base object be C{Cacheable}
  52     """
  53
  54     OTHER_CORPORA = "terms.other_corpora"
  55     ALPHA = "terms.NavigliTermFilter.alpha"
  56     THRESHOLD = "terms.NavigliTermFilter.threshold"
  57
  58     def __init__(self, parameter_file, only_do=None, ignore_cache=None):
  59         """
  60         @param paramter_file: a file containing parameters for the SymbolOntologyGenerator
  61         @param only_do: if specified, only perform steps in this collection
  62         @param ignore_cache: if specified, ignore any cached results from
  63             steps specified in this collection. note that any new results
  64             will still be saved to cache, possibly overwriting existing results.
  65         """
  66         self.__set_parameters(parameter_file)
  67         self.__init_cacheable(ignore_cache=ignore_cache)
  68         self.__init_framework(only_do=only_do)
  69         self.__init_terms()
  70         self.__init_synonyms()
  71         self.__init_concepts()
  72         self.__init_concept_hierarchies()
  73
  74     def __init_cacheable(self, ignore_cache):
  75         """
  76         Initialize the object as cacheable
  77         """
  78         Cacheable.__init__(self,
  79                            os.path.join(CACHEDIR, NAME),
  80                            ignore_cache=ignore_cache,
  81                            debug=True)
  82
  83     def __set_parameters(self, parmeter_file):
  84         """
  85         set parameters from the @param parameter file
  86         """
  87         with open(parmeter_file) as file:
  88             dict.__init__(self, yaml.load(file.read()))
  89
  90     def __init_framework(self, only_do=None):
  91         """
  92         initialize the framework
  93         """
  94         OntologyBuilderFramework.__init__(self,
  95                                           NAME,
  96                                           only_do=only_do)
  97
  98     def __init_terms(self):
  99         """
 100         Initialize the term extraction process.
 101
 102         This creates a regular and highlight corpus, identifies the other coprora
 103         specified in the parameters file, and creates basic tagger, chunker,
 104         chunk_filter, subterm, and term filter objects.
 105
 106         For the regular corpus, the words are found using the regexp
 107             'img#\w+|#\d+|(\w\.){2:}|[\w\']+'
 108         which identifies image references, numbers, abbreviations,
 109         apostrophes, and word characters.
 110
 111         For the highlight corpus, terms are found using the regexp
 112             '(?<=<)[^>]+(?=>)'
 113         which is basically anything between angle backets.
 114
 115         The tagger is a fairly sophisticated model that is part of C{nltk}
 116
 117         The chunker assumes noun phrases consist of adjectives, nouns, and numbers.
 118
 119         The term filter is a  C{NavigliTermFilter} which is parameterized by the values
 120         in the parameter file.
 121
 122         """
 123         corpus = PlaintextCorpusReader(root=CORPUSDIR,
 124                                        fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
 125                                        word_tokenizer=RegexpTokenizer(r'img#\w+|#\d+|(\w\.){2:}|[\w\']+'),
 126                                        encoding='utf-8')
 127         highlights = PlaintextCorpusReader(root=CORPUSDIR,
 128                                            fileids=[os.path.split(file)[1] for file in glob(CORPUSFILES)],
 129                                            word_tokenizer=RegexpTokenizer(r'(?<=<)[^>]+(?=>)'),
 130                                            encoding='utf-8')
 131         other_corpora = self[SymbolOntologyBuilder.OTHER_CORPORA]
 132         tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
 133         chunker = RegexpParser('NP: {<JJ|CD|N.*>+}')
 134         chunk_filter = SimpleChunkFilter('NP', minlength=2)
 135         subterms = SimpleSubterms(minlength=2)
 136         term_filter = NavigliTermFilter(self[SymbolOntologyBuilder.ALPHA],
 137                                         self[SymbolOntologyBuilder.THRESHOLD])
 138
 139         HighlightedTerms.__init__(self,
 140                                   corpus,
 141                                   highlights,
 142                                   other_corpora,
 143                                   tagger,
 144                                   chunker,
 145                                   chunk_filter,
 146                                   subterms,
 147                                   term_filter,
 148                                   debug=True)
 149
 150     def __init_synonyms(self):
 151         SimpleWordnetSynonyms.__init__(self)
 152
 153     def __init_concepts(self):
 154         SimpleConcepts.__init__(self)
 155
 156     def __init_concept_hierarchies(self):
 157         SimpleConceptHierarchies.__init__(self)
 158
 159 if __name__ == "__main__":
 160     builder = SymbolOntologyBuilder('symbol_ontology_builder.yaml',
 161                                     only_do=set((
 162                                                  #OntologyBuilderFramework.TERMS,
 163                                                  #OntologyBuilderFramework.SYNONYMS,
 164                                                  OntologyBuilderFramework.CONCEPTS,
 165                                                  OntologyBuilderFramework.CONCEPT_HIERARCHIES
 166                                                  )),
 167                                     ignore_cache=set((
 168                                                       #OntologyBuilderFramework.TERMS,
 169                                                       #OntologyBuilderFramework.SYNONYMS,
 170                                                       #OntologyBuilderFramework.CONCEPTS,
 171                                                       OntologyBuilderFramework.CONCEPT_HIERARCHIES,
 172                                                       )))
 173     state = builder.process()
 174
 175     # this will display the concept hierarchies, but takes A LONG TIME
 176     #state[OntologyBuilderFramework.CONCEPT_HIERARCHIES].draw()