src/dmv.py.~18~

   1 #### changes by KBU:
   2 # 2008-05-24:
   3 # - prettier printout for DMV_Rule
   4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
   5 #   form (bars, head).
   6 # - Started on P_STOP, a bit less pseudo now..
   7 #
   8 # 2008-05-27:
   9 # - started on initialization. So far, I have frequencies for
  10 #   everything, very harmonic. Still need to make these into 1-summing
  11 #   probabilities
  12 #
  13 # 2008-05-28:
  14 # - more work on initialization (init_freq and init_normalize),
  15 #   getting closer to probabilities now.
  16 #
  17 # 2008-05-29:
  18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
  19 #   and also adds the relevant probabilities to p_rules in a grammar.
  20 #   Still, each individual rule has to store both adjacent and non_adj
  21 #   probabilities, and inner() should be able to send some parameter
  22 #   which lets the rule choose... hopefully... Is this possible to do
  23 #   top-down even? when the sentence could be all the same words?
  24 #   todo: extensive testing of identical words in sentences!
  25 # - frequencies (only used in initialization) are stored as strings,
  26 #   but in the rules and p_STOP etc, there are only numbers.
  27 #
  28 # 2008-05-30
  29 # - copied inner() into this file, to make the very dmv-specific
  30 #   adjacency stuff work (have to factor that out later on, when it
  31 #   works).
  32 #
  33 # 2008-06-01
  34 # - finished typing in inner_dmv(), still have to test and debug
  35 #   it. The chart is now four times as big since for any rule we may
  36 #   have attachments to either the left or the right below, which
  37 #   upper rules depend on, for selecting probN or probA
  38
  39
  40 # import numpy # numpy provides Fast Arrays, for future optimization
  41 import pprint
  42 import io
  43
  44 # todo: tweak this
  45 HARMONIC_C = 0.0
  46
  47
  48 # non-tweakable/constant "lookup" globals
  49 BARS = [0,1,2]
  50 RBAR = 1
  51 LRBAR = 2
  52 NOBAR = 0
  53 ROOT = (LRBAR, -1)
  54 STOP = (NOBAR, -2)
  55 # (is this the best way to represent ROOT and STOP?)
  56
  57 # todo: use these instead for attachment constants. Requires making
  58 # the last two indices of chart[] one single pair, boring retyping and
  59 # testing, worth it?
  60 LRATT = (True,   True)
  61 LATT =  (True,  False)
  62 RATT =  (False,  True)
  63 NOATT = (False, False)
  64
  65 if __name__ == "__main__":
  66     print "DMV module tests:"
  67
  68 class DMV_Grammar(io.Grammar):
  69     '''The DMV-PCFG.
  70
  71     Public members:
  72     p_STOP, p_ROOT, p_CHOOSE, p_terminals
  73     These are changed in the Maximation step, then used to set the
  74     new probabilities of each DMV_Rule.
  75
  76     Todo: make p_terminals private? (But it has to be changable in
  77     maximation step due to the short-cutting rules... could of course
  78     make a DMV_Grammar function to update the short-cut rules...)
  79
  80     __p_rules is private, but we can still say stuff like:
  81     for r in g.all_rules():
  82         r.probN = newProbN
  83
  84     What other representations do we need? (P_STOP formula uses
  85     deps_D(h,l/r) at least)'''
  86     def __str__(self):
  87         str = ""
  88         for r in self.all_rules():
  89              str += "%s\n" % r
  90         return str
  91
  92     def rules(self, b_h):
  93         bars = b_h[0]
  94         head = b_h[1]
  95         return [r for r in self.all_rules() if r.head() == head and r.bars() == bars]
  96
  97     def heads(self):
  98         '''Not sure yet what is needed here, or where this is needed'''
  99         return numtag
 100
 101     def deps_L(self, head):
 102         # todo test, probably this list comprehension doesn't work
 103         return [a for r in self.all_rules() if r.head() == head and a == r.L()]
 104
 105     def deps_R(self, head):
 106         # todo test, probably this list comprehension doesn't work
 107         return [a for r in self.all_rules() if r.head() == head and a == r.R()]
 108
 109     def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag):
 110         io.Grammar.__init__(self, p_rules, p_terminals, numtag)
 111         self.p_STOP = p_STOP
 112         self.p_CHOOSE = p_CHOOSE
 113         self.p_ROOT = p_ROOT
 114
 115
 116 class DMV_Rule(io.CNF_Rule):
 117     '''A single CNF rule in the PCFG, of the form
 118     LHS -> L R
 119     where LHS = (bars, head)
 120
 121     Public members:
 122     probN, probA
 123
 124     Private members:
 125     __L, __R, __LHS
 126
 127     Different rule-types have different probabilities associated with
 128     them:
 129
 130     _h_ -> STOP  h_     P( STOP|h,L,    adj)
 131     _h_ -> STOP  h_     P( STOP|h,L,non_adj)
 132      h_ ->  h  STOP     P( STOP|h,R,    adj)
 133      h_ ->  h  STOP     P( STOP|h,R,non_adj)
 134      h_ -> _a_   h_     P(-STOP|h,L,    adj) * P(a|h,L)
 135      h_ -> _a_   h_     P(-STOP|h,L,non_adj) * P(a|h,L)
 136      h  ->  h   _a_     P(-STOP|h,R,    adj) * P(a|h,R)
 137      h  ->  h   _a_     P(-STOP|h,R,non_adj) * P(a|h,R)
 138     '''
 139     def p(self, LRattach, RLattach, *arg):
 140         '''Returns the correct probability, adjacent or non-adjacent,
 141         depending on whether or not there is a some lower attachment
 142         either on the right side of the left child, or the left side
 143         of the right child. '''
 144         if (not LRattach) and (not RLattach):
 145             return self.probA
 146         else:
 147             return self.probN
 148
 149     def bars(self):
 150         return self.LHS()[0]
 151
 152     def head(self):
 153         return self.LHS()[1]
 154
 155     def __init__(self, LHS, L, R, probN, probA):
 156         for b_h in [LHS, L, R]:
 157             if b_h[0] not in BARS:
 158                 raise ValueError("bars must be in %s; was given: %s" % (BARS, b_h[0]))
 159         io.CNF_Rule.__init__(self, LHS, L, R, probN)
 160         self.probA = probA # adjacent
 161         self.probN = probN # non_adj
 162
 163     @classmethod # so we can call DMV_Rule.bar_str(b_h)
 164     def bar_str(cls, b_h):
 165         str = " %d  " % b_h[1]
 166         if(b_h[0] == RBAR):
 167             str = " %d_ " % b_h[1]
 168         if(b_h[0] == LRBAR):
 169             str = "_%d_ " % b_h[1]
 170         if(b_h == ROOT):
 171             str = 'ROOT'
 172         if(b_h == STOP):
 173             str = 'STOP'
 174         return str
 175
 176     def __str__(self):
 177         return "%s-->%s %s\t[%.2f] [%.2f]" % (self.bar_str(self.LHS()),
 178                                                   self.bar_str(self.L()),
 179                                                   self.bar_str(self.R()),
 180                                                   self.probN,
 181                                                   self.probA)
 182
 183
 184
 185
 186
 187
 188
 189 ###################################
 190 # dmv-specific version of inner() #
 191 ###################################
 192 def rewrite_adj(bars, Lattach, Rattach):
 193     # todo: make prettier? Although since we call this so many times,
 194     # having it spelled out here is probably faster
 195     if bars == NOBAR and not Lattach and Rattach:
 196         return ( (Lattach, False, False, False),
 197                  (Lattach, False, False,  True),
 198                  (Lattach, False, True,  False),
 199                  (Lattach, False, True,   True),
 200                  (Lattach,  True, False, False),
 201                  (Lattach,  True, False,  True),
 202                  (Lattach,  True, True,  False),
 203                  (Lattach,  True, True,   True), )
 204     elif bars == RBAR and Lattach:
 205         # Rattach may be either true or false here!
 206         return ( (False, False, False, Rattach),
 207                  (False, False, True,  Rattach),
 208                  (False, True,  False, Rattach),
 209                  (False, True,  True,  Rattach),
 210                  (True,  False, False, Rattach),
 211                  (True,  False, True,  Rattach),
 212                  (True,  True,  False, Rattach),
 213                  (True,  True,  True,  Rattach) )
 214     else:
 215         # NOBAR rewrite rules cannot have Lattach below, and must
 216         # have/add Rattach. RBAR rewrite rules must add Lattach, but
 217         # don't care about Rattach. Returning () should ensure we
 218         # don't add any probability to such "false" situations
 219         return ()
 220
 221 def inner_dmv(s, t, LHS, g, sent, chart):
 222     ''' A rewrite of inner in io.py, to take adjacency into accord.
 223
 224     The chart is now 4 times bigger, since there are different values
 225     for with or without L/R attachments:
 226     chart[(s,t,LHS, Lattach, Rattach)]
 227
 228     If Rattach==True then the rule has a right-attachment or there is
 229     one lower in the tree (meaning we're no longer
 230     adjacent). Adjacency depends on whether there is an attachment
 231     lower in the tree, cf. DMV_Rule.p(LRattach, RLattach).
 232
 233     Todo: make this work, then, if possible, refactor (move
 234     dmv-specific stuff back into dmv, so this is "general" again)
 235
 236     Or, if that doesn't work, we might as well make it a method of
 237     DMV_Grammar =P
 238     '''
 239     def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
 240         if io.DEBUG:
 241             attach = {
 242                 (True, True): "left and right attachments below",
 243                 (True, False): "left attachment(s) below",
 244                 (False, True): "right attachment(s) below",
 245                 (False, False): "no attachments below" }
 246             info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
 247             print "%sTrying from  %s_%d  to  %s_%d  with %s, %s:" % info
 248
 249     def O(s):
 250         return sent[s]
 251
 252     def e(s,t,LHS, Lattach, Rattach, n_t):
 253         def tab():
 254             return "\t"*n_t
 255
 256         if (s, t, LHS, Lattach, Rattach) in chart:
 257             return chart[(s, t, LHS, Lattach, Rattach)]
 258         else:
 259             debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
 260             if s == t:
 261                 if Lattach or Rattach:
 262                     # terminals are always F,F for attachment
 263                     io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
 264                     return 0.0
 265                 elif (LHS, O(s)) in g.p_terminals:
 266                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
 267                 else:
 268                     # todo: assuming this is how to deal with lacking
 269                     # rules, since we add prob.s, and 0 is identity
 270                     prob = 0.0
 271                     print "%sLACKING TERMINAL:" % tab()
 272                 # todo: add to chart perhaps? Although, it _is_ simple lookup..
 273                 io.debug( "%s= %.1f (terminal: %s -> %s" % (tab(),prob,LHS,O(s)) )
 274                 return prob
 275             else:
 276                 if (s,t,LHS,Lattach, Rattach) not in chart:
 277                     chart[(s,t,LHS,Lattach,Rattach)] = 0.0
 278                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 279                     io.debug( "%ssumming rule %s" % (tab(),rule) )
 280                     L = rule.L()
 281                     R = rule.R()
 282                     # if it's a STOP rule, rewrite for the same range:
 283                     if (L == STOP) or (R == STOP):
 284                         if L == STOP:
 285                             p = rule.p(Lattach, False) # todo check
 286                             pLR = e(s, t, R, Lattach, Rattach, n_t+1)
 287                         elif R == STOP:
 288                             p = rule.p(False, Rattach) # todo check
 289                             pLR = e(s, t, L, Lattach, Rattach, n_t+1)
 290                         chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
 291
 292                     # not a STOP, an attachment rewrite:
 293                     else:
 294                         for r in range(s, t):
 295                             # LL etc are boolean attachment values
 296                             for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
 297                                 p = rule.p(LR, RL) # probN or probA
 298                                 pL = e(s,   r, L, LL, LR, n_t+1)
 299                                 pR = e(r+1, t, R, RL, RR, n_t+1)
 300                                 chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
 301
 302 #                 io.debug( "\tchart[(s:%d, t:%d, %s ,L:%s, R:%s)] = %.2f" % (s,t,
 303 #                                                                 DMV_Rule.bar_str(LHS),
 304 #                                                                 Lattach,
 305 #                                                                 Rattach,
 306 #                                                                 chart[(s,t,LHS,Lattach,Rattach)]) )
 307                 return chart[(s, t, LHS,Lattach,Rattach)]
 308     # end of e-function
 309
 310     inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
 311     if io.DEBUG:
 312         print "---CHART:---"
 313         for k,v in chart.iteritems():
 314             print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(k[2]),
 315                                                                    O(k[0]), k[0],
 316                                                                    O(k[1]), k[1],
 317                                                                    k[3], k[4], v)
 318         print "---CHART:end---"
 319     return [inner_prob, chart]
 320
 321
 322
 323 if __name__ == "__main__":                      # Non, Adj
 324     _h_ = DMV_Rule((LRBAR,0), STOP,    ( RBAR,0), 1.0, 1.0) # LSTOP
 325     h_S = DMV_Rule(( RBAR,0),(NOBAR,0),  STOP,    0.4, 0.3) # RSTOP
 326     h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
 327     h   = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
 328     b2  = {}
 329     b2[(NOBAR, 0), 'h'] = 1.0
 330     b2[(RBAR, 0), 'h'] = h_S.probA
 331     b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
 332
 333     g2 = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'})
 334
 335     io.DEBUG = 0
 336     test1 = inner_dmv(0, 1, (LRBAR,0), g2, 'h h'.split(), {})
 337     print "Should be 0.183: %.3f" % test1[0]
 338     #ZZZ current
 339
 340 ##############################
 341 # DMV-probabilities, todo:   #
 342 ##############################
 343
 344 def P_STOP(STOP, h, dir, adj, corpus):
 345     '''corpus is a list of sentences s.
 346
 347 This is based on the formula where STOP is True... not sure how we
 348 calculate if STOP is False.
 349
 350
 351 I thought about instead having this:
 352
 353 for rule in g.p_rules:
 354     rule.num = 0
 355     rule.den = 0
 356 for sent in corpus:
 357     for rule in g.p_rules:
 358        for s:
 359            for t:
 360                set num and den using inner
 361 for rule in g.p_rules
 362     rule.prob = rule.num / rule.den
 363
 364 ..the way I'm assuming we do it in the commented out io-function in
 365 io.py. Having sentences as the outer loop at least we can easily just
 366 go through the heads that are actually in the sentence... BUT, this
 367 means having to go through p_rules 3 times, not sure what is slower.
 368
 369 oh, and:
 370 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
 371 =D
 372
 373 '''
 374     P_STOP_num = 0
 375     P_STOP_den = 0
 376     for sent in corpus:
 377         # here we should somehow make each word in the sentence
 378         # unique, decorate them with subscripts or something. We have
 379         # to run through the sentence as many times as h appears
 380         # there. This also means changing inner(), I suspect.  Have to
 381         # make sure we separate reading of inner_prob from changing of
 382         # inner_prob...
 383         for s in range(loc(h)): # i<loc(h), where h is in the sentence.
 384             for t in range(i, len(sent)):
 385                 P_STOP_num += inner(s, t, h-r, g, sent, chart)
 386                 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
 387     return P_STOP_num / P_STOP_den # possibly other way round? todo
 388
 389
 390 def P_CHOOSE():
 391     return "todo"
 392
 393 def DMV(sent, g):
 394     '''Here it seems like they store rule information on a per-head (per
 395     direction) basis, in deps_D(h, dir) which gives us a list. '''
 396     def P_h(h):
 397         P_h = 1 # ?
 398         for dir in ['l', 'r']:
 399             for a in deps(h, dir):
 400                 # D(a)??
 401                 P_h *= \
 402                     P_STOP (0, h, dir, adj) * \
 403                     P_CHOOSE (a, h, dir) * \
 404                     P_h(D(a)) * \
 405                 P_STOP (STOP | h, dir, adj)
 406         return P_h
 407     return P_h(root(sent))
 408
 409
 410
 411 ##############################
 412 # Initialization, todo       #
 413 ##############################
 414 def taglist(corpus):
 415     '''sents is of this form:
 416 [['tag', ...], ['tag2', ...], ...]
 417
 418 Return a list of the tags. (Has to be ordered for enumerating to be
 419 consistent.)
 420
 421 Fortunately only has to run once.
 422 '''
 423     tagset = set()
 424     for sent in corpus:
 425         for tag in sent:
 426             tagset.add(tag)
 427     if 'ROOT' in tagset:
 428         raise ValueError("it seems we must have a new ROOT symbol")
 429     return list(tagset)
 430
 431
 432
 433
 434
 435 def init_zeros(tags):
 436     "Return a frequency dictionary with DMV-relevant keys set to 0 / {}."
 437     f = {}
 438     for tag in tags:
 439         f['ROOT', tag] = 0
 440         f['sum', 'ROOT'] = 0
 441         for dir_adj in ['LN','LA','RN','RA']:
 442             f[tag, 'STOP', dir_adj] = 0
 443             f[tag, '-STOP', dir_adj] = 0
 444         f[tag, 'R'] = {}
 445         f[tag, 'L'] = {}
 446         f[tag, 'sum', 'R'] = 0.0
 447         f[tag, 'sum', 'L'] = 0.0
 448     return f
 449
 450 def init_freq(corpus, tags):
 451     '''Returns f, a dictionary with these types of keys:
 452     - ('ROOT', tag) is basically just the frequency of tag
 453     - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
 454       etc. for 'RN', 'LA', 'LN', '-STOP'.
 455     - (tag, 'L') is a dictionary of arg:f, where head could take arg
 456       to direction 'L' (etc. for 'R') and f is "harmonically" divided
 457       by distance, used for finding P_CHOOSE
 458
 459       Does this stuff:
 460       1. counts word frequencies for f_ROOT
 461       2. adds to certain f_STOP counters if a word is found first,
 462          last, first or second, or last or second to last in the sentence
 463          (Left Adjacent, Left Non-Adjacent, etc)
 464       3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
 465          distance between arg and head)
 466     '''
 467     f = init_zeros(tags)
 468
 469     for sent in corpus: # sent is ['VBD', 'NN', ...]
 470         n = len(sent)
 471         # NOTE: head in DMV_Rule is a number, while this is the string
 472         for i_h, head in enumerate(sent):
 473             # todo grok: how is this different from just using straight head
 474             # frequency counts, for the ROOT probabilities?
 475             f['ROOT', head] += 1
 476             f['sum', 'ROOT'] += 1
 477
 478             # True = 1, False = 0. todo: make prettier
 479             f[head,  'STOP', 'LN'] += (i_h <= 1)     # first two words
 480             f[head, '-STOP', 'LN'] += (not i_h <= 1)
 481             f[head,  'STOP', 'LA'] += (i_h == 0)     # very first word
 482             f[head, '-STOP', 'LA'] += (not i_h == 0)
 483             f[head,  'STOP', 'RN'] += (i_h >= n - 2) # last two words
 484             f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
 485             f[head,  'STOP', 'RA'] += (i_h == n - 1) # very last word
 486             f[head, '-STOP', 'RA'] += (not i_h == n - 1)
 487
 488             # this is where we make the "harmonic" distribution. quite.
 489             for i_a, arg in enumerate(sent):
 490                 if i_h != i_a:
 491                     harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
 492                     if i_h > i_a:
 493                         dir = 'L'
 494                     else:
 495                         dir = 'R'
 496                     if arg not in f[head, dir]:
 497                         f[head, dir][arg] = 0.0
 498                     f[head, dir][arg] += harmony
 499                     f[head, 'sum', dir] += harmony
 500                 # todo, optimization: possible to do both directions
 501                 # at once here, and later on rule out the ones we've
 502                 # done? does it actually speed things up?
 503
 504     return f
 505
 506 def init_normalize(f, tags, tagnum, numtag):
 507     '''Use frequencies (and sums) in f to return create p_STOP and
 508     p_CHOOSE; at the same time adding the context-free rules to the
 509     grammar using these probabilities.
 510
 511     Return a usable grammar.'''
 512     p_rules = []
 513     p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
 514     for n_h, head in enumerate(tags):
 515         p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
 516         p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
 517                                  p_ROOT[n_h],
 518                                  p_ROOT[n_h]))
 519
 520         # p_STOP = STOP / (STOP + NOT_STOP)
 521         for dir in ['L','R']:
 522             for adj in ['N','A']:
 523                 p_STOP[n_h, dir+adj] = \
 524                     float(f[head, 'STOP', dir+adj]) / \
 525                     (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
 526             # make rule using the previously found probN and probA:
 527             p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
 528                                      p_STOP[n_h, dir+'N'],
 529                                      p_STOP[n_h, dir+'A']) )
 530
 531         # inner() shouldn't have to deal with those long non-branching stops:
 532         p_terminals[(NOBAR, n_h), head] = 1.0
 533         p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
 534         p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
 535
 536         for dir in ['L', 'R']:
 537             for arg, val in f[head, dir].iteritems():
 538                 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
 539
 540     # after the head tag-loop, add every head-argument rule:
 541     for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
 542         if dir == 'L': # arg is to the left of head
 543             p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
 544                                      p_C*(1-p_STOP[n_h, dir+'N']),
 545                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 546         if dir == 'R':
 547             p_rules.append( DMV_Rule((NOBAR,n_h), (LRBAR,n_a), (NOBAR,n_h),
 548                                      p_C*(1-p_STOP[n_h, dir+'N']),
 549                                      p_C*(1-p_STOP[n_h, dir+'A'])) )
 550
 551     return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag)
 552
 553
 554 def initialize(corpus):
 555     '''Return an initialized DMV_Grammar
 556     corpus is a list of lists of tags.'''
 557     tags = taglist(corpus)
 558     tagnum, numtag = {}, {}
 559     for num, tag in enumerate(tags):
 560         tagnum[tag] = num
 561         numtag[num] = tag
 562     # f: frequency counts used in initialization, mostly distances
 563     f = init_freq(corpus, tags)
 564     g = init_normalize(f, tags, tagnum, numtag)
 565     return g
 566
 567
 568 if __name__ == "__main__":
 569 #     print "--------initialization------------"
 570 #     print initialize([['foo', 'two','foo','foo'],
 571 #                       ['zero', 'one','two','three']])
 572     pass
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585 # todo: some testing on the Brown corpus:
 586
 587 if __name__ == "__main__":
 588     # first five sentences of the Brown corpus:
 589     g_brown = initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
 590     # 36:'AT' in g_brown.numtag, 40:'NP-TL'
 591     test_brown = inner_dmv(0, 1, (LRBAR,36), g_brown, ['AT', 'NP-TL'], {})
 592     print "Brown-test gives: %.3f" % test_brown[0]
 593     print "todo"
 594
 595     # this will give the tag sequences of all the 6218 Brown corpus
 596     # sentences of length < 7:
 597     # [[tag for (w, tag) in sent]
 598     #  for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
 599
 600
 601 def tagset_brown():
 602     "472 tags, takes a while to extract with tagset(), hardcoded here."
 603     return set(['BEDZ-NC', 'NP$', 'AT-TL', 'CS', 'NP+HVZ', 'IN-TL-HL', 'NR-HL', 'CC-TL-HL', 'NNS$-HL', 'JJS-HL', 'JJ-HL', 'WRB-TL', 'JJT-TL', 'WRB', 'DOD*', 'BER*-NC', ')-HL', 'NPS$-HL', 'RB-HL', 'FW-PPSS', 'NP+HVZ-NC', 'NNS$', '--', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'PPSS+MD', 'NPS', 'RBR+CS', 'DTI', 'NPS-TL', 'BEM', 'FW-AT+NP-TL', 'EX+BEZ', 'BEG', 'BED', 'BEZ', 'DTX', 'DOD*-TL', 'FW-VB-NC', 'DTS', 'DTS+BEZ', 'QL-HL', 'NP$-TL', 'WRB+DOD*', 'JJR+CS', 'NN+MD', 'NN-TL-HL', 'HVD-HL', 'NP+BEZ-NC', 'VBN+TO', '*-TL', 'WDT-HL', 'MD', 'NN-HL', 'FW-BE', 'DT$', 'PN-TL', 'DT-HL', 'FW-NR-TL', 'VBG', 'VBD', 'VBN', 'DOD', 'FW-VBG-TL', 'DOZ', 'ABN-TL', 'VB+JJ-NC', 'VBZ', 'RB+CS', 'FW-PN', 'CS-NC', 'VBG-NC', 'BER-HL', 'MD*', '``', 'WPS-TL', 'OD-TL', 'PPSS-HL', 'PPS+MD', 'DO*', 'DO-HL', 'HVG-HL', 'WRB-HL', 'JJT', 'JJS', 'JJR', 'HV+TO', 'WQL', 'DOD-NC', 'CC-HL', 'FW-PPSS+HV', 'FW-NP-TL', 'MD+TO', 'VB+IN', 'JJT-NC', 'WDT+BEZ-TL', '---HL', 'PN$', 'VB+PPO', 'BE-TL', 'VBG-TL', 'NP$-HL', 'VBZ-TL', 'UH', 'FW-WPO', 'AP+AP-NC', 'FW-IN', 'NRS-TL', 'ABL', 'ABN', 'TO-TL', 'ABX', '*-HL', 'FW-WPS', 'VB-NC', 'HVD*', 'PPS+HVD', 'FW-IN+AT', 'FW-NP', 'QLP', 'FW-NR', 'FW-NN', 'PPS+HVZ', 'NNS-NC', 'DT+BEZ-NC', 'PPO', 'PPO-NC', 'EX-HL', 'AP$', 'OD-NC', 'RP', 'WPS+BEZ', 'NN+BEZ', '.-TL', ',', 'FW-DT+BEZ', 'RB', 'FW-PP$-NC', 'RN', 'JJ$-TL', 'MD-NC', 'VBD-NC', 'PPSS+BER-N', 'RB+BEZ-NC', 'WPS-HL', 'VBN-NC', 'BEZ-HL', 'PPL-NC', 'BER-TL', 'PP$$', 'NNS+MD', 'PPS-NC', 'FW-UH-NC', 'PPS+BEZ-NC', 'PPSS+BER-TL', 'NR-NC', 'FW-JJ', 'PPS+BEZ-HL', 'NPS$', 'RB-TL', 'VB-TL', 'BEM*', 'MD*-HL', 'FW-CC', 'NP+MD', 'EX+HVZ', 'FW-CD', 'EX+HVD', 'IN-HL', 'FW-CS', 'JJR-HL', 'FW-IN+NP-TL', 'JJ-TL-HL', 'FW-UH', 'EX', 'FW-NNS-NC', 'FW-JJ-NC', 'VBZ-HL', 'VB+RP', 'BEZ-NC', 'PPSS+HV-TL', 'HV*', 'IN', 'PP$-NC', 'NP-NC', 'BEN', 'PP$-TL', 'FW-*-TL', 'FW-OD-TL', 'WPS', 'WPO', 'MD+PPSS', 'WDT+BER', 'WDT+BEZ', 'CD-HL', 'WDT+BEZ-NC', 'WP$', 'DO+PPSS', 'HV-HL', 'DT-NC', 'PN-NC', 'FW-VBZ', 'HVD', 'HVG', 'NN+BEZ-TL', 'HVZ', 'FW-VBD', 'FW-VBG', 'NNS$-TL', 'JJ-TL', 'FW-VBN', 'MD-TL', 'WDT+DOD', 'HV-TL', 'NN-TL', 'PPSS', 'NR$', 'BER', 'FW-VB', 'DT', 'PN+BEZ', 'VBG-HL', 'FW-PPL+VBZ', 'FW-NPS-TL', 'RB$', 'FW-IN+NN', 'FW-CC-TL', 'RBT', 'RBR', 'PPS-TL', 'PPSS+HV', 'JJS-TL', 'NPS-HL', 'WPS+BEZ-TL', 'NNS-TL-HL', 'VBN-TL-NC', 'QL-TL', 'NN+NN-NC', 'JJR-TL', 'NN$-TL', 'FW-QL', 'IN-TL', 'BED-NC', 'NRS', '.-HL', 'QL', 'PP$-HL', 'WRB+BER', 'JJ', 'WRB+BEZ', 'NNS$-TL-HL', 'PPSS+BEZ', '(', 'PPSS+BER', 'DT+MD', 'DOZ-TL', 'PPSS+BEM', 'FW-PP$', 'RB+BEZ-HL', 'FW-RB+CC', 'FW-PPS', 'VBG+TO', 'DO*-HL', 'NR+MD', 'PPLS', 'IN+IN', 'BEZ*', 'FW-PPL', 'FW-PPO', 'NNS-HL', 'NIL', 'HVN', 'PPSS+BER-NC', 'AP-TL', 'FW-DT', '(-HL', 'DTI-TL', 'JJ+JJ-NC', 'FW-RB', 'FW-VBD-TL', 'BER-NC', 'NNS$-NC', 'JJ-NC', 'NPS$-TL', 'VB+VB-NC', 'PN', 'VB+TO', 'AT-TL-HL', 'BEM-NC', 'PPL-TL', 'ABN-HL', 'RB-NC', 'DO-NC', 'BE-HL', 'WRB+IN', 'FW-UH-TL', 'PPO-HL', 'FW-CD-TL', 'TO-HL', 'PPS+BEZ', 'CD$', 'DO', 'EX+MD', 'HVZ-TL', 'TO-NC', 'IN-NC', '.', 'WRB+DO', 'CD-NC', 'FW-PPO+IN', 'FW-NN$-TL', 'WDT+BEZ-HL', 'RP-HL', 'CC', 'NN+HVZ-TL', 'FW-NNS-TL', 'DT+BEZ', 'WPS+HVZ', 'BEDZ*', 'NP-TL', ':-TL', 'NN-NC', 'WPO-TL', 'QL-NC', 'FW-AT+NN-TL', 'WDT+HVZ', '.-NC', 'FW-DTS', 'NP-HL', ':-HL', 'RBR-NC', 'OD-HL', 'BEDZ-HL', 'VBD-TL', 'NPS-NC', ')', 'TO+VB', 'FW-IN+NN-TL', 'PPL', 'PPS', 'PPSS+VB', 'DT-TL', 'RP-NC', 'VB', 'FW-VB-TL', 'PP$', 'VBD-HL', 'DTI-HL', 'NN-TL-NC', 'PPL-HL', 'DOZ*', 'NR-TL', 'WRB+MD', 'PN+HVZ', 'FW-IN-TL', 'PN+HVD', 'BEN-TL', 'BE', 'WDT', 'WPS+HVD', 'DO-TL', 'FW-NN-NC', 'WRB+BEZ-TL', 'UH-TL', 'JJR-NC', 'NNS', 'PPSS-NC', 'WPS+BEZ-NC', ',-TL', 'NN$', 'VBN-TL-HL', 'WDT-NC', 'OD', 'FW-OD-NC', 'DOZ*-TL', 'PPSS+HVD', 'CS-TL', 'WRB+DOZ', 'CC-NC', 'HV', 'NN$-HL', 'FW-WDT', 'WRB+DOD', 'NN+HVZ', 'AT-NC', 'NNS-TL', 'FW-BEZ', 'CS-HL', 'WPO-NC', 'FW-BER', 'NNS-TL-NC', 'BEZ-TL', 'FW-IN+AT-T', 'ABN-NC', 'NR-TL-HL', 'BEDZ', 'NP+BEZ', 'FW-AT-TL', 'BER*', 'WPS+MD', 'MD-HL', 'BED*', 'HV-NC', 'WPS-NC', 'VBN-HL', 'FW-TO+VB', 'PPSS+MD-NC', 'HVZ*', 'PPS-HL', 'WRB-NC', 'VBN-TL', 'CD-TL-HL', ',-NC', 'RP-TL', 'AP-HL', 'FW-HV', 'WQL-TL', 'FW-AT', 'NN', 'NR$-TL', 'VBZ-NC', '*', 'PPSS-TL', 'JJT-HL', 'FW-NNS', 'NP', 'UH-HL', 'NR', ':', 'FW-NN$', 'RP+IN', ',-HL', 'JJ-TL-NC', 'AP-NC', '*-NC', 'VB-HL', 'HVZ-NC', 'DTS-HL', 'FW-JJT', 'FW-JJR', 'FW-JJ-TL', 'FW-*', 'RB+BEZ', "''", 'VB+AT', 'PN-HL', 'PPO-TL', 'CD-TL', 'UH-NC', 'FW-NN-TL-NC', 'EX-NC', 'PPSS+BEZ*', 'TO', 'WDT+DO+PPS', 'IN+PPO', 'AP', 'AT', 'DOZ-HL', 'FW-RB-TL', 'CD', 'NN+IN', 'FW-AT-HL', 'PN+MD', "'", 'FW-PP$-TL', 'FW-NPS', 'WDT+BER+PP', 'NN+HVD-TL', 'MD+HV', 'AT-HL', 'FW-IN+AT-TL'])
 604
 605
 606
 607