1 # io.py, with sentence positions between word locations (i<k<j)
5 # some of dmv-module bleeding in here... todo: prettier (in inner())
10 def debug(string
, level
='TODO'):
11 '''Easily turn on/off inline debug printouts with this global. There's
12 a lot of cluttering debug statements here, todo: clean up'''
18 '''The PCFG used in the I/O-algorithm.
23 Todo: as of now, this allows duplicate rules... should we check
24 for this? (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
25 inner probabilities of 2.)'''
30 return [rule
for rule
in self
.all_rules() if rule
.LHS() == LHS
]
33 # return self.__numtag
36 return self
.__head
_nums
38 def sent_nums(self
, sent
):
39 return [self
.tagnum(tag
) for tag
in sent
]
41 def numtag(self
, num
):
42 if num
== ROOTNUM
: # don't want these added to headnums (which we iter through)
45 return self
.__numtag
[num
]
47 def tagnum(self
, tag
):
51 return self
.__tagnum
[tag
]
53 def __init__(self
, numtag
, tagnum
, p_rules
=[], p_terminals
=[]):
54 '''rules and p_terminals should be arrays, where p_terminals are of
55 the form [preterminal, terminal], and rules are CNF_Rule's.'''
56 self
.__numtag
= numtag
57 self
.__tagnum
= tagnum
58 self
.__head
_nums
= [k
for k
in numtag
.iterkeys()]
59 self
.__p
_rules
= p_rules
# todo: could check for summing to 1 (+/- epsilon)
60 self
.p_terminals
= p_terminals
66 '''A single CNF rule in the PCFG, of the form
68 where these are just integers
69 (where do we save the connection between number and symbol?
70 symbols being 'vbd' etc.)'''
71 def __eq__(self
, other
):
72 return self
.LHS() == other
.LHS() and self
.R() == other
.R() and self
.L() == other
.L()
73 def __ne__(self
, other
):
74 return self
.LHS() != other
.LHS() or self
.R() != other
.R() or self
.L() != other
.L()
76 return "%s -> %s %s [%.2f]" % (self
.LHS(), self
.L(), self
.R(), self
.prob
)
77 def __init__(self
, LHS
, L
, R
, prob
):
83 "Return a probability, doesn't care about attachment..."
92 def inner(i
, j
, LHS
, g
, sent
, chart
):
93 ''' Give the inner probability of having the node LHS cover whatever's
94 between s and t in sentence sent, using grammar g.
96 Returns a pair of the inner probability and the chart
98 For DMV, LHS is a pair (bar, h), but this function ought to be
101 e() is an internal function, so the variable chart (a dictionary)
102 is available to all calls of e().
104 Since terminal probabilities are just simple lookups, they are not
105 put in the chart (although we could put them in there later to
113 '''Chart has lists of probability and whether or not we've attached
114 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
115 Rattach==True then the rule has a right-attachment or there is one
116 lower in the tree (meaning we're no longer adjacent).'''
117 if (i
, j
, LHS
) in chart
:
118 return chart
[i
, j
, LHS
]
120 debug( "trying from %d to %d with %s" % (i
,j
,LHS
) , "IO")
122 if (LHS
, O(i
,j
)) in g
.p_terminals
:
123 prob
= g
.p_terminals
[LHS
, O(i
,j
)] # b[LHS, O(s)] in L&Y
126 print "\t LACKING TERMINAL:%s -> %s : %.1f" % (LHS
, O(i
,j
), prob
)
127 debug( "\t terminal: %s -> %s : %.1f" % (LHS
, O(i
,j
), prob
) ,"IO")
128 # terminals have no attachment
131 if (i
,j
,LHS
) not in chart
:
132 # by default, not attachment yet
134 for rule
in g
.rules(LHS
): # summing over rules headed by LHS, "a[i,j,k]"
135 debug( "\tsumming rule %s" % rule
, "IO")
138 for k
in range(i
+1, j
): # i<k<j
142 chart
[i
, j
, LHS
] += p
* p_L
* p_R
143 debug( "\tchart[%d,%d,%s] = %.2f" % (i
,j
,LHS
, chart
[i
,j
,LHS
]) ,"IO")
144 return chart
[i
, j
, LHS
]
147 inner_prob
= e(i
,j
,LHS
)
150 for k
,v
in chart
.iteritems():
151 print "\t%s -> %s_%d ... %s_%d : %.1f" % (k
[2], O(k
[0]), k
[0], O(k
[1]), k
[1], v
)
152 print "---CHART:end---"
153 return [inner_prob
, chart
]
162 if __name__
== "__main__":
163 print "IO-module tests:"
165 s
= CNF_Rule(0,1,2, 1.0) # s->np vp
166 np
= CNF_Rule(1,3,4, 0.3) # np->n p
167 b
[1, 'n'] = 0.7 # np->'n'
168 b
[3, 'n'] = 1.0 # n->'n'
169 b
[4, 'p'] = 1.0 # p->'p'
170 vp
= CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
171 vp2
= CNF_Rule(2,2,4, 0.9) # vp->vp p
172 b
[5, 'v'] = 1.0 # v->'v'
174 g
= Grammar({0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'},
175 {'s':0,'np':1,'vp':2,'n':3,'p':4,'v':5},
179 # for i in range(0,5):
180 # for r in g.rules(i):
184 test1
= inner(0,1, 1, g
, ['n'], {})
186 print "should be 0.70 : %.3f" % test1
[0]
189 test2
= inner(0,3, 2, g
, ['v','n','p'], test1
[1])
190 if test2
[0] != 0.0930:
191 print "should be 0.0930 : %.4f" % test2
[0]
192 test2
= inner(0,3, 2, g
, ['v','n','p'], test2
[1])
193 if test2
[0] != 0.0930:
194 print "should be 0.0930 : %.4f" % test2
[0]