# This Python file uses the following encoding: utf-8
from itertools import chain, combinations

from nltk.corpus import wordnet
from nltk.tree import Tree

from framework import OntologyBuilderFramework, OntologyBuilderConceptHierarchies
from util.cached import Cached
from util.memoized import Memoized
class SimpleConceptHierarchies(OntologyBuilderConceptHierarchies):
    """Arrange concepts into a tree using subterm inclusion and WordNet hypernymy.

    A concept is a sequence of synset names whose last element is its head.
    Concepts that share a head are grouped, the groups are joined through
    WordNet hypernyms they have in common, and the resulting tree is rooted
    at ``root_synset`` ("entity.n.01" by default).

    NOTE(review): this class was restored from a listing that had lost its
    indentation and, apparently, every short line (``else:``, ``return ...``,
    short loop headers, ``.append(...)`` calls).  Statements marked
    "restored" below are inferred from the surviving code and should be
    checked against the file's history.  ``Memoized`` is imported by the
    file but no use of it survives; a decorator line may have been lost.

    Attribute-style access (``.node``, ``.pos``, ``.name``) is kept exactly
    as found; NLTK 3 turned these into methods.
    """

    def __init__(self, root_synset='entity.n.01', acceptable_pos=(wordnet.NOUN,)):
        """Store the root synset name and the accepted parts of speech.

        root_synset -- name of the synset every accepted head must descend from
        acceptable_pos -- parts of speech a head synset may have; the default
            is an immutable tuple so it cannot be mutated across instances
        """
        self.__root_synset = root_synset
        self.__acceptable_pos = acceptable_pos

    @Cached(lambda *x, **y: OntologyBuilderFramework.CONCEPT_HIERARCHIES)
    def _get_concept_hierarchies(self, **state):
        """Build the concept tree from ``state[OntologyBuilderFramework.CONCEPTS]``.

        Concepts are sequences of singular synsets.  A concept A is placed
        higher than a concept B in the tree if

        1. len(A) == len(B), A[1:] == B[1:] and
           A[0] is a hypernym of B[0] in wordnet, or

        2. len(A) < len(B) and
           B = [b0,..bn,a0,..am] where A = [a0,...,am]

        Intermediary wordnet concepts are inserted to differentiate between
        final terms.  Returns the merged tree (return statement restored).
        """
        concepts = state[OntologyBuilderFramework.CONCEPTS]

        # Group concepts by head (last element), keeping only heads that
        # descend from the root synset.  "head = concept[-1]" is restored.
        groups_by_head = {}
        for concept in concepts:
            head = concept[-1]
            head_synset = wordnet.synset(head)
            if self.__is_a_legitimate_head_synset(head_synset):
                groups_by_head.setdefault(head, set()).add(concept)

        group_trees = [self.__merge_group(head, group)
                       for head, group in groups_by_head.items()]

        merged = self.__merge_groups(group_trees)
        self.__sort_tree(merged)
        self.__attach_terms_by_concept(merged, concepts)
        return merged

    def __is_a_legitimate_head_synset(self, head_synset):
        """Return True if the synset has an accepted part of speech and the
        root synset occurs on at least one of its hypernym paths."""
        if head_synset.pos not in self.__acceptable_pos:
            return False
        root = wordnet.synset(self.__root_synset)
        return any(root in path for path in head_synset.hypernym_paths())

    def __sort_tree(self, tree):
        """Sort the children of every subtree in place by node label.

        NOTE(review): the original body was lost; its docstring said the
        sort "does not seem to work".  This version assumes every child is
        a Tree, which holds when it runs before terms are attached.
        """
        for subtree in tree.subtrees():
            subtree.sort(key=lambda child: child.node)

    def __attach_terms_by_concept(self, tree, concepts):
        """Attach the actual terms discovered in the corpus to the tree.

        ``concepts`` is indexed by node label; each term listed for a label
        is appended to the matching subtree unless already present.
        """
        # Snapshot the subtrees so appending terms cannot disturb the walk.
        for subtree in list(tree.subtrees()):
            if subtree.node in concepts:
                for term in concepts[subtree.node]:
                    if term not in subtree:
                        subtree.append(term)  # restored

    def __is_a_hypernym(self, a, b):
        """Return True if the concept ``a`` is a hypernym of the concept ``b``.

        Either ``b`` ends with ``a``, or both have the same length, agree on
        everything after the first element, and ``a[0]`` is a WordNet
        hypernym of ``b[0]``.

        NOTE(review): the tail comparison sits where a line was lost in the
        source; without it, equal-length concepts would be ordered by their
        first synset alone.
        """
        if self.__endswith(a, b):
            return True
        return (len(a) == len(b)
                and list(a[1:]) == list(b[1:])
                and self.__is_synset_a_hypernym(a[0], b[0]))

    def __endswith(self, a, b):
        """Return True if the sequence ``b`` ends with the sequence ``a``."""
        offset = len(b) - len(a)
        if offset < 0:
            # a is longer than b; also avoids negative-index wraparound
            return False
        return list(b[offset:]) == list(a)

    def __is_synset_a_hypernym(self, a, b):
        """Return True if the synset named ``a`` is a proper hypernym of the
        synset named ``b``."""
        if a == b:
            return False
        ancestor = wordnet.synset(a)
        return any(ancestor in path
                   for path in wordnet.synset(b).hypernym_paths())

    def __merge_group(self, head, group):
        """Merge a group of concepts with the same head into one tree rooted
        at ``(head,)`` and return it (return statement restored)."""
        tree = Tree((head,), [])
        for element in group:
            self.__merge(Tree(element, []), tree)
        return tree

    def __merge(self, element, tree):
        """Merge ``element`` (a Tree) into the larger ``tree`` in place.

        NOTE(review): about a dozen lines of this method were lost; the
        control flow below is rebuilt around the surviving statements.
        """
        # Pass 1: push the element down into every child that is one of
        # its hypernyms.
        found_subtree = False
        parent_ids = set()
        for leaf in list(tree):
            if self.__is_a_hypernym(leaf.node, element.node):
                self.__merge(element, leaf)
                found_subtree = True
                parent_ids.add(id(leaf))

        # Pass 2: children that are hyponyms of the element move beneath it.
        # Iterate over a snapshot because the tree is mutated, and skip the
        # children already used as parents so no cycle can form.
        adopted = False
        for leaf in list(tree):
            if id(leaf) in parent_ids or leaf is element:
                continue
            if self.__is_a_hypernym(element.node, leaf.node):
                if not leaf in element:
                    element.append(leaf)
                del tree[tree.index(leaf)]
                adopted = True

        # The element stays at this level when it adopted a child or found
        # no deeper home; this also covers the empty-tree case.
        if (adopted or not found_subtree) and not element in tree:
            tree.append(element)

    def __merge_groups(self, groups):
        """Merge the group subtrees into a single tree rooted at the root
        synset (entity.n.01 by default) and return it.

        For every pair of groups, one hypernym the two heads share is
        inserted as an intermediary node before the groups are merged in.
        """
        groups = list(groups)  # iterated twice below
        tree = Tree((self.__root_synset,), [])

        found = set((self.__root_synset,))
        for a, b in combinations(groups, 2):
            common = wordnet.synset(a.node[-1]).common_hypernyms(
                wordnet.synset(b.node[-1]))
            if not common:
                continue
            # NOTE(review): common[0] is kept as found; confirm whether the
            # lowest common hypernym was intended.
            common_node = common[0].name
            if not common_node in found:
                self.__merge(Tree((common_node,), []), tree)
                found.add(common_node)

        for group in groups:  # loop header restored
            self.__merge(group, tree)
        return tree  # restored