src/before_betweens_dmv.py

   1 # before_betweens_dmv.py
   2 #
   3 # dmv reestimation and inside-outside probabilities using loc_h, but
   4 # at-word sentence locations
   5
   6 #import numpy # numpy provides Fast Arrays, for future optimization
   7 import io
   8 from common_dmv import *
   9
  10 if __name__ == "__main__":
  11     print "before_betweens_dmv module tests:"
  12
  13 class DMV_Grammar(io.Grammar):
  14     '''The DMV-PCFG.
  15
  16     Public members:
  17     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  18     These are changed in the Maximation step, then used to set the
  19     new probabilities of each DMV_Rule.
  20
  21     Todo: make p_terminals private? (But it has to be changable in
  22     maximation step due to the short-cutting rules... could of course
  23     make a DMV_Grammar function to update the short-cut rules...)
  24
  25     __p_rules is private, but we can still say stuff like:
  26     for r in g.all_rules():
  27         r.probN = newProbN
  28
  29     What other representations do we need? (P_STOP formula uses
  30     deps_D(h,l/r) at least)'''
  31     def __str__(self):
  32         str = ""
  33         for r in self.all_rules():
  34              str += "%s\n" % r.__str__(self.numtag)
  35         return str
  36
  37     def h_rules(self, h):
  38         return [r for r in self.all_rules() if r.POS() == h]
  39
  40     def mothersL(self, Node, sent_nums, loc_N):
  41         # todo: speed-test with and without sent_nums/loc_N cut-off
  42         return [r for r in self.all_rules() if r.L() == Node
  43                 and (POS(r.R()) in sent_nums[loc_N+1:] or r.R() == STOP)]
  44
  45     def mothersR(self, Node, sent_nums, loc_N):
  46         return [r for r in self.all_rules() if r.R() == Node
  47                 and (POS(r.L()) in sent_nums[:loc_N] or r.L() == STOP)]
  48
  49     def rules(self, LHS):
  50         return [r for r in self.all_rules() if r.LHS() == LHS]
  51
  52     def sent_rules(self, LHS, sent_nums):
  53         '''Used in dmv.inner. Todo: this takes a _lot_ of time, it
  54         seems. Could use some more space and cache some of this
  55         somehow perhaps?'''
  56         # We don't want to rule out STOPs!
  57         nums = sent_nums + [ POS(STOP) ]
  58         return [r for r in self.all_rules() if r.LHS() == LHS
  59                 and POS(r.L()) in nums and POS(r.R()) in nums]
  60
  61     def deps_L(self, head): # todo: do I use this at all?
  62         # todo test, probably this list comprehension doesn't work
  63         return [a for r in self.all_rules() if r.POS() == head and a == r.L()]
  64
  65     def deps_R(self, head):
  66         # todo test, probably this list comprehension doesn't work
  67         return [a for r in self.all_rules() if r.POS() == head and a == r.R()]
  68
  69     def __init__(self, numtag, tagnum, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT):
  70         io.Grammar.__init__(self, numtag, tagnum, p_rules, p_terminals)
  71         self.p_STOP = p_STOP
  72         self.p_CHOOSE = p_CHOOSE
  73         self.p_ROOT = p_ROOT
  74         self.head_nums = [k for k in numtag.iterkeys()]
  75
  76
  77 class DMV_Rule(io.CNF_Rule):
  78     '''A single CNF rule in the PCFG, of the form
  79     LHS -> L R
  80     where LHS, L and R are 'nodes', eg. of the form (seals, head).
  81
  82     Public members:
  83     probN, probA
  84
  85     Private members:
  86     __L, __R, __LHS
  87
  88     Different rule-types have different probabilities associated with
  89     them:
  90
  91     _h_ -> STOP  h_     P( STOP|h,L,    adj)
  92     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
  93      h_ ->  h  STOP     P( STOP|h,R,    adj)
  94      h_ ->  h  STOP     P( STOP|h,R,non_adj)
  95      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
  96      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
  97      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
  98      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
  99     '''
 100     def p(self, adj, *arg):
 101         if adj:
 102             return self.probA
 103         else:
 104             return self.probN
 105
 106     def adj(middle, loc_h):
 107         "middle is eg. k when rewriting for i<k<j (inside probabilities)."
 108         return middle == loc_h[0] or middle == loc_h[1]
 109
 110     def p_STOP(self, s, t, loc_h):
 111         '''Returns the correct probability, adjacent if we're rewriting from
 112         the (either left or right) end of the fragment.
 113         '''
 114         if self.L() == STOP:
 115             return self.p(s == loc_h)
 116         elif self.R() == STOP:
 117             if not loc_h == s:
 118                 if 'TODO' in DEBUG:
 119                     print "(%s given loc_h:%d but s:%d. Todo: optimize away!)" % (self, loc_h, s)
 120                 return 0.0
 121             else:
 122                 return self.p(t == loc_h)
 123
 124     def p_ATTACH(self, r, loc_h, s=None):
 125         '''Returns the correct probability, adjacent if we haven't attached
 126         anything before.
 127         (This is actually p_choose*(1-p_stop).)'''
 128         if self.LHS() == self.L():
 129             if s and not loc_h == s:
 130                 if 'TODO' in DEBUG:
 131                     print "(%s given loc_h (loc_L):%d but s:%d. Todo: optimize away!)" % (self, loc_h, s)
 132                 return 0.0
 133             else:
 134                 return self.p(r == loc_h)
 135         elif self.LHS() == self.R():
 136             return self.p(r+1 == loc_h)
 137
 138     def seals(self):
 139         return seals(self.LHS())
 140
 141     def POS(self):
 142         return POS(self.LHS())
 143
 144     def __init__(self, LHS, L, R, probN, probA):
 145         for b_h in [LHS, L, R]:
 146             if seals(b_h) not in SEALS:
 147                 raise ValueError("seals must be in %s; was given: %s"
 148                                  % (SEALS, seals(b_h)))
 149         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 150         self.probA = probA # adjacent
 151         self.probN = probN # non_adj
 152
 153     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 154     def bar_str(cls, b_h, tag=lambda x:x):
 155         if(b_h == ROOT):
 156             return 'ROOT'
 157         elif(b_h == STOP):
 158             return 'STOP'
 159         elif(seals(b_h) == RGOL):
 160             return " %s_ " % tag(POS(b_h))
 161         elif(seals(b_h) == SEAL):
 162             return "_%s_ " % tag(POS(b_h))
 163         else:
 164             return " %s  " % tag(POS(b_h))
 165
 166
 167     def __str__(self, tag=lambda x:x):
 168         return "%s-->%s %s\t[N %.2f] [A %.2f]" % (self.bar_str(self.LHS(), tag),
 169                                                   self.bar_str(self.L(), tag),
 170                                                   self.bar_str(self.R(), tag),
 171                                                   self.probN,
 172                                                   self.probA)
 173
 174
 175
 176
 177
 178
 179
 180 ###################################
 181 # dmv-specific version of inner() #
 182 ###################################
 183 def locs(h, sent, s=0, t=None, remove=None):
 184     '''Return the locations of h in sent, or some fragment of sent (in the
 185     latter case we make sure to offset the locations correctly so that
 186     for any x in the returned list, sent[x]==h).
 187
 188     t is inclusive, to match the way indices work with inner()
 189     (although python list-splicing has "exclusive" end indices)'''
 190     if t == None:
 191         t = len(sent)-1
 192     return [i+s for i,w in enumerate(sent[s:t+1])
 193             if w == h and not (i+s) == remove]
 194
 195
 196 def inner(s, t, LHS, loc_h, g, sent, ichart={}):
 197     ''' A rewrite of io.inner(), to take adjacency into accord.
 198
 199     The ichart is now of this form:
 200     ichart[s,t,LHS, loc_h]
 201
 202     loc_h gives adjacency (along with r and location of other child
 203     for attachment rules), and is needed in P_STOP reestimation.
 204
 205     Todo: if possible, refactor (move dmv-specific stuff back into
 206     dmv, so this is "general" enough to be in io.py)
 207     '''
 208
 209     def O(s):
 210         return sent[s]
 211
 212     sent_nums = g.sent_nums(sent)
 213
 214     def e(s,t,LHS, loc_h, n_t):
 215         def tab():
 216             "Tabs for debug output"
 217             return "\t"*n_t
 218
 219         if (s, t, LHS, loc_h) in ichart:
 220             if 'INNER' in DEBUG:
 221                 print "%s*= %.4f in ichart: s:%d t:%d LHS:%s loc:%d" % (tab(),ichart[s, t, LHS, loc_h], s, t,
 222                                                                        DMV_Rule.bar_str(LHS), loc_h)
 223             return ichart[s, t, LHS, loc_h]
 224         else:
 225             if s == t and seals(LHS) == GOR:
 226                 if not loc_h == s:
 227                     if 'INNER' in DEBUG:
 228                         print "%s*= 0.0 (wrong loc_h)" % tab()
 229                     return 0.0
 230                 elif (LHS, O(s)) in g.p_terminals:
 231                     prob = g.p_terminals[LHS, O(s)] # "b[LHS, O(s)]" in Lari&Young
 232                 else:
 233                     # todo: assuming this is how to deal w/lacking
 234                     # rules, since we add prob.s, and 0 is identity
 235                     prob = 0.0
 236                     if 'INNER' in DEBUG:
 237                         print "%sLACKING TERMINAL:" % tab()
 238                 # todo: add to ichart perhaps? Although, it _is_ simple lookup..
 239                 if 'INNER' in DEBUG:
 240                     print "%s*= %.4f (terminal: %s -> %s_%d)" % (tab(),prob, DMV_Rule.bar_str(LHS), O(s), loc_h)
 241                 return prob
 242             else:
 243                 p = 0.0 # "sum over j,k in a[LHS,j,k]"
 244                 for rule in g.sent_rules(LHS, sent_nums):
 245                     if 'INNER' in DEBUG:
 246                         print "%ssumming rule %s s:%d t:%d loc:%d" % (tab(),rule,s,t,loc_h)
 247                     L = rule.L()
 248                     R = rule.R()
 249                     if loc_h == t and LHS == L:
 250                         continue # todo: speed-test
 251                     if loc_h == s and LHS == R:
 252                         continue
 253                     # if it's a STOP rule, rewrite for the same xrange:
 254                     if (L == STOP) or (R == STOP):
 255                         if L == STOP:
 256                             pLR = e(s, t, R, loc_h, n_t+1)
 257                         elif R == STOP:
 258                             pLR = e(s, t, L, loc_h, n_t+1)
 259                         p += rule.p_STOP(s, t, loc_h) * pLR
 260                         if 'INNER' in DEBUG:
 261                             print "%sp= %.4f (STOP)" % (tab(), p)
 262
 263                     elif t > s: # not a STOP, attachment rewrite:
 264                         rp_ATTACH = rule.p_ATTACH # todo: profile/speedtest
 265                         for r in xrange(s, t):
 266                             p_h = rp_ATTACH(r, loc_h, s=s)
 267                             if LHS == L:
 268                                 locs_L = [loc_h]
 269                                 locs_R = locs(POS(R), sent_nums, r+1, t, loc_h)
 270                             elif LHS == R:
 271                                 locs_L = locs(POS(L), sent_nums,  s,  r, loc_h)
 272                                 locs_R = [loc_h]
 273                             for loc_L in locs_L:
 274                                 pL = e(s, r, L, loc_L, n_t+1)
 275                                 if pL > 0.0:
 276                                     for loc_R in locs_R:
 277                                         pR = e(r+1, t, R, loc_R, n_t+1)
 278                                         p += pL * p_h * pR
 279                             if 'INNER' in DEBUG:
 280                                 print "%sp= %.4f (ATTACH)" % (tab(), p)
 281                 ichart[s, t, LHS, loc_h] = p
 282                 return p
 283     # end of e-function
 284
 285     inner_prob = e(s,t,LHS,loc_h, 0)
 286     if 'INNER' in DEBUG:
 287         print debug_ichart(g,sent,ichart)
 288     return inner_prob
 289 # end of dmv.inner(s, t, LHS, loc_h, g, sent, ichart={})
 290
 291
 292 def debug_ichart(g,sent,ichart):
 293     str = "---ICHART:---\n"
 294     for (s,t,LHS,loc_h),v in ichart.iteritems():
 295         if type(v) == dict: # skip 'tree'
 296             continue
 297         str += "%s -> %s_%d ... %s_%d (loc_h:%s):\t%.4f\n" % (DMV_Rule.bar_str(LHS,g.numtag),
 298                                                               sent[s], s, sent[s], t, loc_h, v)
 299     str += "---ICHART:end---\n"
 300     return str
 301
 302
 303 def inner_sent(g, sent, ichart={}):
 304     return sum([inner(0, len(sent)-1, ROOT, loc_h, g, sent, ichart)
 305                 for loc_h in xrange(len(sent))])
 306
 307
 308 ###################################
 309 # dmv-specific version of outer() #
 310 ###################################
 311 def outer(s,t,Node,loc_N, g, sent, ichart={}, ochart={}):
 312     ''' http://www.student.uib.no/~kun041/dmvccm/DMVCCM.html#outer
 313     '''
 314     def e(s,t,LHS,loc_h):
 315         # or we could just look it up in ichart, assuming ichart to be done
 316         return inner(s, t, LHS, loc_h, g, sent, ichart)
 317
 318     T = len(sent)-1
 319     sent_nums = g.sent_nums(sent)
 320
 321     def f(s,t,Node,loc_N):
 322         if (s,t,Node,loc_N) in ochart:
 323             return ochart[(s, t, Node,loc_N)]
 324         if Node == ROOT:
 325             if s == 0 and t == T:
 326                 return 1.0
 327             else: # ROOT may only be used on full sentence
 328                 return 0.0 # but we may have non-ROOTs over full sentence too
 329         p = 0.0
 330
 331         for mom in g.mothersL(Node, sent_nums, loc_N): # mom.L() == Node
 332             R = mom.R()
 333             mLHS = mom.LHS()
 334             if R == STOP:
 335                 p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
 336             else:
 337                 if seals(mLHS) == RGOL: # left attachment, POS(mLHS) == POS(R)
 338                     for r in xrange(t+1,T+1): # t+1 to lasT
 339                         for loc_m in locs(POS(mLHS),sent_nums,t+1,r):
 340                             p_m = mom.p(t+1 == loc_m)
 341                             p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_m)
 342                 elif seals(mLHS) == GOR: # right attachment, POS(mLHS) == POS(Node)
 343                     loc_m = loc_N
 344                     p_m = mom.p( t  == loc_m)
 345                     for r in xrange(t+1,T+1): # t+1 to lasT
 346                         for loc_R in locs(POS(R),sent_nums,t+1,r):
 347                             p += f(s,r,mLHS,loc_m) * p_m * e(t+1,r,R,loc_R)
 348
 349         for mom in g.mothersR(Node, sent_nums, loc_N): # mom.R() == Node
 350             L = mom.L()
 351             mLHS = mom.LHS()
 352             if L == STOP:
 353                 p += f(s,t,mLHS,loc_N) * mom.p_STOP(s,t,loc_N) # == loc_m
 354             else:
 355                 if seals(mLHS) == RGOL: # left attachment, POS(mLHS) == POS(Node)
 356                     loc_m = loc_N
 357                     p_m = mom.p( s  == loc_m)
 358                     for r in xrange(0,s): # first to s-1
 359                         for loc_L in locs(POS(L),sent_nums,r,s-1):
 360                             p += e(r,s-1,L, loc_L) * p_m * f(r,t,mLHS,loc_m)
 361                 elif seals(mLHS) == GOR: # right attachment, POS(mLHS) == POS(L)
 362                     for r in xrange(0,s): # first to s-1
 363                         for loc_m in locs(POS(mLHS),sent_nums,r,s-1):
 364                             p_m = mom.p(s-1 == loc_m)
 365                             p += e(r,s-1,L, loc_m) * p_m * f(r,t,mLHS,loc_m)
 366         ochart[s,t,Node,loc_N] = p
 367         return p
 368
 369
 370     return f(s,t,Node,loc_N)
 371 # end outer(s,t,Node,loc_N, g,sent, ichart,ochart)
 372
 373
 374
 375 ##############################
 376 #      reestimation, todo:   #
 377 ##############################
 378 ## using local version instead
 379 # def c(s,t,LHS,loc_h,g,sent,ichart={},ochart={}):
 380 #     # assuming P_sent = P(D(ROOT)) = inner(sent). todo: check K&M about this
 381 #     p_sent = inner_sent(g, sent, ichart)
 382 #     p_in = inner(s,t,LHS,loc_h,g,sent,ichart)
 383 #     p_out = outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
 384 #     if p_sent > 0.0:
 385 #         return p_in * p_out / p_sent
 386 #     else:
 387 #         return p_sent
 388
 389 def reest_zeros(h_nums):
 390     # todo: p_ROOT? ... p_terminals?
 391     f = {}
 392     for h in h_nums:
 393         for stop in ['LNSTOP','LASTOP','RNSTOP','RASTOP']:
 394             for nd in ['num','den']:
 395                 f[stop,nd,h] = 0.0
 396         for choice in ['RCHOOSE', 'LCHOOSE']:
 397             f[choice,'den',h] = 0.0
 398     return f
 399
 400 def reest_freq(g, corpus):
 401     ''' P_STOP(-STOP|...) = 1 - P_STOP(STOP|...) '''
 402     f = reest_zeros(g.head_nums)
 403     ichart = {}
 404     ochart = {}
 405
 406     p_sent = None # 50 % speed increase on storing this locally
 407     def c_g(s,t,LHS,loc_h,sent): # altogether 2x faster than the global c()
 408         if (s,t,LHS,loc_h) in ichart:
 409             p_in = ichart[s,t,LHS,loc_h]
 410         else:
 411             p_in = inner(s,t,LHS,loc_h,g,sent,ichart)
 412         if (s,t,LHS,loc_h) in ochart:
 413             p_out = ochart[s,t,LHS,loc_h]
 414         else:
 415             p_out = outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
 416
 417         if p_sent > 0.0:
 418             return p_in * p_out / p_sent
 419         else:
 420             return p_sent
 421
 422     def w_g(s,t,a,loc_a,LHS,loc_h,sent):
 423         "Todo: should sum through all r in between s and t in sent(_nums)"
 424         h = POS(LHS)
 425         b_h = seals(LHS)
 426         if b_h == GOR:
 427             return e_L * e_R * f_g(s,t,(GOR, h), loc_h, sent) * p_g(r,(GOR, h), (GOR, h), (SEAL, a), loc_h, sent_nums)
 428         if b_h == RGOL:
 429             return e_L * e_R * f_g(s,t,(RGOL, h), loc_h, sent) * p_g(r,(RGOL, h),(SEAL, a),(RGOL, h),loc_h,sent_nums)
 430
 431     def f_g(s,t,LHS,loc_h,sent): # todo: test with choose rules
 432         if (s,t,LHS,loc_h) in ochart:
 433             return ochart[s,t,LHS,loc_h]
 434         else:
 435             return outer(s,t,LHS,loc_h,g,sent,ichart,ochart)
 436
 437     def e_g(s,t,LHS,loc_h,sent): # todo: test with choose rules
 438         if (s,t,LHS,loc_h) in ichart:
 439             return ichart[s,t,LHS,loc_h]
 440         else:
 441             return inner(s,t,LHS,loc_h,g,sent,ichart)
 442
 443     def p_g(r,LHS,L,R,loc_h,sent):
 444         rules = [rule for rule in g.sent_rules(LHS, sent)
 445                  if rule.L() == L and rule.R() == R]
 446         rule = rules[0]
 447         if len(rules) > 1:
 448             raise Exception("Several rules matching a[i,j,k]")
 449         return rule.p_ATTACH(r,loc_h)
 450
 451     for sent in corpus:
 452         if 'reest' in DEBUG:
 453             print sent
 454         ichart = {}
 455         ochart = {}
 456         p_sent = inner_sent(g, sent, ichart)
 457
 458         sent_nums = g.sent_nums(sent)
 459         # todo: use sum([ichart[s, t...] etc? but can we then
 460         # keep den and num separate within _one_ sum()-call?
 461         for loc_h,h in enumerate(sent_nums):
 462             for t in xrange(loc_h, len(sent)):
 463                 for s in xrange(loc_h): # s<loc(h), xrange gives strictly less
 464                     # left non-adjacent stop:
 465                     f['LNSTOP','num',h] += c_g(s, t, (SEAL, h), loc_h,sent)
 466                     f['LNSTOP','den',h] += c_g(s, t, (RGOL,h), loc_h,sent)
 467                 # left adjacent stop:
 468                 f['LASTOP','num',h] += c_g(loc_h, t, (SEAL, h), loc_h,sent)
 469                 f['LASTOP','den',h] += c_g(loc_h, t, (RGOL,h), loc_h,sent)
 470             for t in xrange(loc_h+1, len(sent)):
 471                 # right non-adjacent stop:
 472                 f['RNSTOP','num',h] += c_g(loc_h, t, (RGOL,h), loc_h,sent)
 473                 f['RNSTOP','den',h] += c_g(loc_h, t, (GOR, h), loc_h,sent)
 474             # right adjacent stop:
 475             f['RASTOP','num',h] += c_g(loc_h, loc_h, (RGOL,h), loc_h,sent)
 476             f['RASTOP','den',h] += c_g(loc_h, loc_h, (GOR, h), loc_h,sent)
 477
 478             # right attachment:  TODO: try with p*e*e*f instead of c, for numerator
 479             if 'reest_attach' in DEBUG:
 480                 print "Rattach %s: for t in %s"%(g.numtag(h),sent[loc_h+1:len(sent)])
 481             for t in xrange(loc_h+1, len(sent)):
 482                 cM = c_g(loc_h,t,(GOR, h), loc_h, sent) # v_q in L&Y
 483                 f['RCHOOSE','den',h] += cM
 484                 if 'reest_attach' in DEBUG:
 485                     print "\tc_g( %d , %d, %s, %s, sent)=%.4f"%(loc_h,t,g.numtag(h),loc_h,cM)
 486                 args = {} # for summing w_q's in L&Y, without 1/P_q
 487                 for r in xrange(loc_h+1, t+1): # loc_h < r <= t
 488                     e_L = e_g(loc_h, r-1, (GOR, h), loc_h, sent)
 489                     if 'reest_attach' in DEBUG:
 490                         print "\t\te_g( %d , %d, %s, %d, sent)=%.4f"%(loc_h,r-1,g.numtag(h),loc_h,e_L)
 491                     for i,a in enumerate(sent_nums[r:t+1]):
 492                         loc_a = i+r
 493                         e_R = e_g(r, t, (SEAL, a), loc_a, sent)
 494                         if a not in args:
 495                             args[a] = 0.0
 496                         args[a] += e_L * e_R * f_g(loc_h,t,(GOR, h), loc_h, sent) * p_g(r,(GOR, h), (GOR, h), (SEAL, a), loc_h, sent_nums)
 497                     for a,sum_a in args.iteritems():
 498                         f['RCHOOSE','num',h,a] = sum_a / p_sent
 499
 500
 501             # left attachment:
 502             if 'reest_attach' in DEBUG:
 503                 print "Lattach %s: for s in %s"%(g.numtag(h),sent[0:loc_h])
 504             for s in xrange(0, loc_h):
 505                 if 'reest_attach' in DEBUG:
 506                     print "\tfor t in %s"%sent[loc_h:len(sent)]
 507                 for t in xrange(loc_h, len(sent)):
 508                     c_M = c_g(s,t,(RGOL, h), loc_h, sent) # v_q in L&Y
 509                     f['LCHOOSE','den',h] += c_M
 510                     if 'reest_attach' in DEBUG:
 511                         print "\t\tc_g( %d , %d, %s_, %s, sent)=%.4f"%(s,t,g.numtag(h),loc_h,c_M)
 512                     if 'reest_attach' in DEBUG:
 513                         print "\t\tfor r in %s"%(sent[s:loc_h])
 514                     args = {} # for summing w_q's in L&Y, without 1/P_q
 515                     for r in xrange(s, loc_h): # s <= r < loc_h <= t
 516                         e_R = e_g(r+1, t, (RGOL, h), loc_h, sent)
 517                         if 'reest_attach' in DEBUG:
 518                             print "\t\te_g( %d , %d, %s_, %d, sent)=%.4f"%(r+1,t,g.numtag(h),loc_h,e_R)
 519                         for i,a in enumerate(sent_nums[s:r+1]):
 520                             loc_a = i+s
 521                             e_L = e_g( s , r, (SEAL, a), loc_a, sent)
 522                             if a not in args:
 523                                 args[a] = 0.0
 524                             args[a] += e_L * e_R * f_g(s,t,(RGOL, h), loc_h, sent) * p_g(r,(RGOL, h),(SEAL, a),(RGOL, h),loc_h,sent_nums)
 525                     for a,sum_a in args.iteritems():
 526                         f['LCHOOSE', 'num',h,a] = sum_a / p_sent
 527     return f
 528
 529 def reestimate(g, corpus):
 530     ""
 531     f = reest_freq(g, corpus)
 532     # we want to go through only non-ROOT left-STOPs..
 533     for r in g.all_rules():
 534         reest_rule(r,f, g)
 535     return f
 536
 537
 538 def reest_rule(r,f, g): # g just for numtag / debug output, remove eventually?
 539     "remove 0-prob rules? todo"
 540     h = r.POS()
 541     if r.LHS() == ROOT:
 542         return None # not sure what todo yet here
 543     if r.L() == STOP or POS(r.R()) == h:
 544         dir = 'L'
 545     elif r.R() == STOP or POS(r.L()) == h:
 546         dir = 'R'
 547     else:
 548         raise Exception("Odd rule in reestimation.")
 549
 550     p_stopN = f[dir+'NSTOP','den',h]
 551     if p_stopN > 0.0:
 552         p_stopN = f[dir+'NSTOP','num',h] / p_stopN
 553
 554     p_stopA = f[dir+'ASTOP','den',h]
 555     if p_stopA > 0.0:
 556         p_stopA = f[dir+'ASTOP','num',h] / p_stopA
 557
 558     if r.L() == STOP or r.R() == STOP: # stop rules
 559         if 'reest' in DEBUG:
 560             print "p(STOP|%d=%s,%s,N): %.4f (was: %.4f)"%(h,g.numtag(h),dir, p_stopN, r.probN)
 561             print "p(STOP|%d=%s,%s,A): %.4f (was: %.4f)"%(h,g.numtag(h),dir, p_stopA, r.probA)
 562         r.probN = p_stopN
 563         r.probA = p_stopA
 564
 565     else: # attachment rules
 566         pchoose = f[dir+'CHOOSE','den',h]
 567         if pchoose > 0.0:
 568             if POS(r.R()) == h: # left attachment
 569                 a = POS(r.L())
 570             elif POS(r.L()) == h: # right attachment
 571                 a = POS(r.R())
 572             pchoose = f[dir+'CHOOSE','num',h,a] / pchoose
 573             r.probN = (1-p_stopN) * pchoose
 574             r.probA = (1-p_stopA) * pchoose
 575             if 'reest' in DEBUG:
 576                 print "p(%d=%s|%d=%s,%s): %.4f,\tprobN: %.4f, probA: %.4f"%(a,g.numtag(a),h,g.numtag(h),dir, pchoose,r.probN,r.probA)
 577
 578
 579
 580
 581
 582
 583
 584 ##############################
 585 #     testing functions:     #
 586 ##############################
 587
# Small corpus for the test functions below (testgrammar,
# testreestimation, testIO): every sentence is a list of tag strings,
# obtained by splitting on whitespace.
testcorpus = [s.split() for s in ['det nn vbd c vbd','vbd nn c vbd',
                                  'det nn vbd',      'det nn vbd c pp',
                                  'det nn vbd',      'det vbd vbd c pp',
                                  'det nn vbd',      'det nn vbd c vbd',
                                  'det nn vbd',      'det nn vbd c vbd',
                                  'det nn vbd',      'det nn vbd c vbd',
                                  'det nn vbd',      'det nn vbd c pp',
                                  'det nn vbd pp',   'det nn vbd', ]]
 596
 597 def testgrammar():
 598     import before_betweens_harmonic
 599     reload(before_betweens_harmonic)
 600     return before_betweens_harmonic.initialize(testcorpus)
 601
def testreestimation():
    '''Regression check of the P_STOP counts: reestimate the test
    grammar on testcorpus and compare each STOP count with the stored
    value, at 10 decimals. Mismatches and missing keys are only printed;
    nothing is raised or returned.'''
    g = testgrammar()
    f = reestimate(g, testcorpus)
    # the expected (stop_type, 'num'|'den', head_num) counts
    f_stops = {('LNSTOP', 'den', 3): 12.212773236178391, ('RASTOP', 'den', 2): 4.0, ('RNSTOP', 'num', 4): 2.5553487221351365, ('LNSTOP', 'den', 2): 1.274904052793207, ('LASTOP', 'num', 1): 14.999999999999995, ('RASTOP', 'den', 3): 15.0, ('LASTOP', 'num', 4): 16.65701084787457, ('LASTOP', 'num', 0): 4.1600647714443468, ('LNSTOP', 'den', 4): 6.0170669155897105, ('LASTOP', 'num', 3): 2.7872267638216113, ('LASTOP', 'num', 2): 2.9723139990470515, ('LASTOP', 'den', 2): 4.0, ('RNSTOP', 'den', 3): 12.945787931730905, ('LASTOP', 'den', 3): 14.999999999999996, ('RNSTOP', 'den', 2): 0.0, ('LASTOP', 'den', 0): 8.0, ('RASTOP', 'num', 4): 19.44465127786486, ('RNSTOP', 'den', 1): 3.1966410324085777, ('LASTOP', 'den', 1): 14.999999999999995, ('RASTOP', 'num', 3): 4.1061665495365558, ('RNSTOP', 'den', 0): 4.8282499043902476, ('LNSTOP', 'num', 4): 5.3429891521254289, ('RASTOP', 'num', 2): 4.0, ('LASTOP', 'den', 4): 22.0, ('RASTOP', 'num', 1): 12.400273895299103, ('LNSTOP', 'num', 2): 1.0276860009529487, ('RASTOP', 'num', 0): 3.1717500956097533, ('LNSTOP', 'num', 3): 12.212773236178391, ('RASTOP', 'den', 4): 22.0, ('RNSTOP', 'den', 4): 2.8705211946979836, ('LNSTOP', 'num', 0): 3.8399352285556518, ('LNSTOP', 'num', 1): 0.0, ('RNSTOP', 'num', 0): 4.8282499043902476, ('RNSTOP', 'num', 1): 2.5997261047008959, ('LNSTOP', 'den', 1): 0.0, ('RASTOP', 'den', 0): 8.0, ('RNSTOP', 'num', 2): 0.0, ('LNSTOP', 'den', 0): 4.6540557322109795, ('RASTOP', 'den', 1): 15.0, ('RNSTOP', 'num', 3): 10.893833450463443}
    for k,v in f_stops.iteritems():
        if not k in f:
            print '''Regression!(?) Something changed in the P_STOP reestimation,
expected f[%s]=%.4f, but %s not in f'''%(k,v,k)
            pass
        # compare as formatted strings, i.e. rounded to 10 decimals
        elif not "%.10f"%f[k] == "%.10f"%v:
            print '''Regression!(?) Something changed in the P_STOP reestimation,
expected f[%s]=%.4f, got f[%s]=%.4f.'''%(k,v,k,f[k])
            pass
 615
 616
 617 def testgrammar_a():                            # Non, Adj
 618     _h_ = DMV_Rule((SEAL,0), STOP,    ( RGOL,0), 1.0, 1.0) # LSTOP
 619     h_S = DMV_Rule(( RGOL,0),(GOR,0),  STOP,    0.4, 0.3) # RSTOP
 620     h_A = DMV_Rule(( RGOL,0),(SEAL,0),( RGOL,0),0.2, 0.1) # Lattach
 621     h_Aa= DMV_Rule(( RGOL,0),(SEAL,1),( RGOL,0),0.4, 0.6) # Lattach to a
 622     h   = DMV_Rule((GOR,0),(GOR,0),(SEAL,0),    1.0, 1.0) # Rattach
 623     ha  = DMV_Rule((GOR,0),(GOR,0),(SEAL,1),    1.0, 1.0) # Rattach to a
 624     rh  = DMV_Rule(   ROOT,   STOP,    (SEAL,0),  0.9, 0.9) # ROOT
 625
 626     _a_ = DMV_Rule((SEAL,1), STOP,    ( RGOL,1), 1.0, 1.0) # LSTOP
 627     a_S = DMV_Rule(( RGOL,1),(GOR,1),  STOP,    0.4, 0.3) # RSTOP
 628     a_A = DMV_Rule(( RGOL,1),(SEAL,1),( RGOL,1),0.4, 0.6) # Lattach
 629     a_Ah= DMV_Rule(( RGOL,1),(SEAL,0),( RGOL,1),0.2, 0.1) # Lattach to h
 630     a   = DMV_Rule((GOR,1),(GOR,1),(SEAL,1),    1.0, 1.0) # Rattach
 631     ah  = DMV_Rule((GOR,1),(GOR,1),(SEAL,0),    1.0, 1.0) # Rattach to h
 632     ra  = DMV_Rule(   ROOT,   STOP,    (SEAL,1),  0.1, 0.1) # ROOT
 633
 634     b2  = {}
 635     b2[(GOR, 0), 'h'] = 1.0
 636     b2[(GOR, 1), 'a'] = 1.0
 637
 638     return DMV_Grammar({0:'h',1:'a'}, {'h':0,'a':1}, [ h_Aa, ha, a_Ah, ah, ra, _a_, a_S, a_A, a, rh, _h_, h_S, h_A, h ],b2,0,0,0)
 639 def oa(s,t,LHS,loc_h):
 640     return outer(s,t,LHS,loc_h,testgrammar_a(),'h a'.split())
 641 def ia(s,t,LHS,loc_h):
 642     return inner(s,t,LHS,loc_h,testgrammar_a(),'h a'.split())
 643 def ca(s,t,LHS,loc_h):
 644     return c(s,t,LHS,loc_h,testgrammar_a(),'h a'.split())
 645
 646 def testgrammar_h():                            # Non, Adj
 647     _h_ = DMV_Rule((SEAL,0), STOP,    ( RGOL,0), 1.0, 1.0) # LSTOP
 648     h_S = DMV_Rule(( RGOL,0),(GOR,0),  STOP,    0.4, 0.3) # RSTOP
 649     h_A = DMV_Rule(( RGOL,0),(SEAL,0),( RGOL,0), 0.6, 0.7) # Lattach
 650     h   = DMV_Rule((GOR,0),(GOR,0),(SEAL,0), 1.0, 1.0) # Rattach
 651     rh  = DMV_Rule(   ROOT,   STOP,    (SEAL,0), 1.0, 1.0) # ROOT
 652     b2  = {}
 653     b2[(GOR, 0), 'h'] = 1.0
 654
 655     return DMV_Grammar({0:'h'}, {'h':0}, [ rh, _h_, h_S, h_A, h ],b2,0,0,0)
 656
 657
 658 def testreestimation_h():
 659     g = testgrammar_h()
 660     reestimate(g,['h h h'.split()])
 661
 662
 663 def regression_tests():
 664     def test(wanted, got):
 665         if not wanted == got:
 666             print "Regression! Should be %s: %s" % (wanted, got)
 667
 668     g_dup = testgrammar_h()
 669
 670     test("0.120",
 671          "%.3f" % inner(0, 1, (SEAL,0), 0, g_dup, 'h h'.split(), {}))
 672
 673     test("0.063",
 674          "%.3f" % inner(0, 1, (SEAL,0), 1, g_dup, 'h h'.split(), {}))
 675
 676     test("0.0498",
 677          "%.4f" % inner(0, 2, (SEAL,0), 2, g_dup, 'h h h'.split(), {}))
 678
 679     test("0.58" ,
 680          "%.2f" % outer(1,2,(1,0),2,testgrammar_h(),'h h h'.split(),{},{}))
 681
 682     test("0.1089" ,
 683          "%.4f" % outer(0,0,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
 684     test("0.3600" ,
 685          "%.4f" % outer(0,1,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
 686     test("0.0000" ,
 687          "%.4f" % outer(0,2,(0,0),0,testgrammar_a(),'h a'.split(),{},{}))
 688
 689
 690 if __name__ == "__main__":
 691     DEBUG.clear()
 692 if __name__ == "__main__":
 693     regression_tests()
 694     testreestimation()
 695
 696 def testIO():
 697     g = testgrammar()
 698     inners = [(sent, inner_sent(g, sent, {})) for sent in testcorpus]
 699     return inners