doc finished, just tweaking some stuff
[nltk_ontology_framework.git] / src / mjacob / ontologybuilder / simple_concept_hierarchies.py
blob5ff7ded60c04f6838908ae4c8c8c8e83acd80dbd
1 # This Python file uses the following encoding: utf-8
2 '''
3 Created on May 5, 2011
5 @author: mjacob
6 '''
7 from framework import OntologyBuilderFramework, OntologyBuilderConceptHierarchies
8 from nltk.tree import Tree
9 from itertools import chain, combinations
10 from nltk.corpus import wordnet
11 from util.cached import Cached
12 from util.memoized import Memoized
13 wn=wordnet
15 class SimpleConceptHierarchies(OntologyBuilderConceptHierarchies):
16 """
17 constructs a hierarchical model of concepts representing terms based upon
18 subterm inclusion and heirarchical relations amongst the least significant
19 subterms.
21 the hierarchy is built in a substrate of a minimal subset of terms from
22 wordnet, based around the "entity" synset.
23 """
25 def __init__(self, root_synset='entity.n.01', acceptable_pos=[wn.NOUN]):
26 """initialze a couple of constants"""
27 self.__root_synset = root_synset
28 self.__acceptable_pos = acceptable_pos
30 @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPT_HIERARCHIES)
31 def _get_concept_hierarchies(self, **state):
32 """
33 given concepts, which are sequences of singular synsets, construct a tree
34 of concepts around the "entity.n.01" synset in worndnet.
36 a concept A will be placed higher than a concept B in the Tree if
37 1. len(A) == len(B) and
38 A[1:] == B[1:] and
39 A[0] is a hypernym of B[0] in wordnet
41 2. len(A) < len(B) and
42 B = [b0,..bn,a0,..am] where A = [a0,...,am]
44 intermediary concepts in wordnet will be inserted to differentiate between final terms
45 """
46 concepts = state[OntologyBuilderFramework.CONCEPTS]
48 groups_by_head = {}
49 for concept in concepts:
50 head = concept[-1]
51 head_synset = wn.synset(head)
52 if self.__is_a_legitimate_head_synset(head_synset):
53 if head in groups_by_head:
54 groups_by_head[head].add(concept)
55 else:
56 groups_by_head[head] = set((concept,))
58 group_trees = []
59 for head in groups_by_head:
60 group_trees.append(self.__merge_group(head, groups_by_head[head]))
62 merged = self.__merge_groups(group_trees)
64 self.__sort_tree(merged)
66 self.__attach_terms_by_concept(merged, concepts)
68 return merged
70 @Memoized
71 def __is_a_legitimate_head_synset(self, head_synset):
72 """
73 returns true if the head synset is a noun, and if "entity" is the head of one of its chains
74 """
75 return head_synset.pos in self.__acceptable_pos and wn.synset(self.__root_synset) in set(chain(*head_synset.hypernym_paths()))
77 def __sort_tree(self, tree):
78 """sort a subtree so that its nodes are in lexicographic order.
79 note that this does not seem to work."""
80 for subtree in tree:
81 subtree.sort()
83 def __attach_terms_by_concept(self, tree, concepts):
84 """
85 attach the actual terms discovered in the corpus to the tree
86 """
87 for subtree in tree.subtrees():
88 if subtree.node in concepts:
89 for term in concepts[subtree.node]:
90 if term not in subtree:
91 subtree.append(term)
93 def __is_a_hypernym(self, a, b):
94 """
95 returns true if the concept @param a is a hypernym of the concept @param b
96 """
97 return self.__endswith(a, b) or (len(a) == len(b) and
98 a[1:] == b[1:] and
99 self.__is_synset_a_hypernym(a[0], b[0]))
101 def __endswith(self, a, b):
102 """returns true if the list b ends with the list a"""
103 if len(a) >= len(b):
104 return False
106 for i in xrange(len(a)):
107 if a[i] != b[len(b) - len(a) + i]:
108 return False
109 return True
111 @Memoized
112 def __is_synset_a_hypernym(self, a, b):
113 """ returns true if the synset @param a is a hypernym of the synset @param b """
114 return a != b and wn.synset(a) in chain(*wn.synset(b).hypernym_paths())
116 def __merge_group(self, head, group):
117 """merges a group of synsets with the same head"""
118 tree = Tree((head,), [])
119 for element in group:
120 el_tree = Tree(element, [])
121 self.__merge(el_tree, tree)
123 return tree
125 def __merge(self, element, tree):
126 """merge a element of a tree into a larger tree"""
127 if element in tree:
128 return
130 found_subtree = False
131 for leaf in tree:
132 if self.__is_a_hypernym(leaf.node, element.node):
133 self.__merge(element, leaf)
134 found_subtree = True
136 was_above = False
137 for leaf in tree:
138 if self.__is_a_hypernym(element.node, leaf.node):
139 if not leaf in element:
140 element.append(leaf)
141 was_above = True
143 if was_above:
144 for leaf in element:
145 if leaf in tree:
146 del tree[tree.index(leaf)]
147 if not element in tree:
148 tree.append(element)
150 elif not found_subtree:
151 if not element in tree:
152 tree.append(element)
154 def __merge_groups(self, groups):
155 """merge group of subtrees into a single tree, rooted at the root synset entity.n.01"""
156 tree = Tree((self.__root_synset,), [])
158 found = set((self.__root_synset,))
159 for a, b in combinations(groups, 2):
160 common = wn.synset(a.node[-1]).common_hypernyms(wn.synset(b.node[-1]))
161 if common:
162 common_node = common[0].name
163 if not common_node in found:
164 self.__merge(Tree((common_node,), []), tree)
165 found.add(common_node)
167 for group in groups:
168 self.__merge(group, tree)
170 return tree