tex/DMVCCM.org.~1~

   1 # -*- coding: mule-utf-8-unix -*-
   2
   3 #+STARTUP: overview
   4 #+TAGS: OPTIMIZE PRETTIER
   5 #+STARTUP: hidestars
   6 #+TITLE: DMV/CCM -- todo-list / progress
   7 #+AUTHOR: Kevin Brubeck Unhammer
   8 #+EMAIL: K.BrubeckUnhammer at student uva nl
   9 #+OPTIONS: H:4
  10 #+OPTIONS: toc:3
  11 #+OPTIONS: ^:{}
  12 #+LANGUAGE: en
  13 #+SEQ_TODO: TOGROK TODO DONE
  14
  15 * dmvccm report and project
  16   DEADLINE: <2008-06-30 Mon>
  17 - [[file:src/dmv.py][dmv.py]]
  18 - [[file:src/io.py][io.py]]
  19 - [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]
  20
  21 (But absolute, extended, really-quite-dead-now deadline: August 31...)
  22 * DONE outer probabilities
  23   CLOSED: [2008-06-12 Thu 11:11]
  24 # <<outer>>
  25 There are 6 different configurations here, based on what the above
  26 (mother) node is, and what the Node for which we're computing is.
  27 Here *r* is not between *s* and *t* as in inner(), but an /outer/ index. *loc_N*
  28 is the location of the Node head in the sentence, *loc_m* for the head
  29 of the mother of Node.
  30
  31 + mother is a RIGHT-stop:
  32   - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  33   - adjacent iff *t* == *loc_m*
  34 + mother is a  LEFT-stop:
  35   - outer(*s, t*, mom.LHS, *loc_N*), no inner-call
  36   - adjacent iff *s* == *loc_m*
  37
  38 + Node is on the LEFT branch (mom.L == Node)
  39   * and mother is a LEFT attachment:
  40     - *loc_N* will be in the LEFT branch, can be anything here.
  41     - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_m*) for
  42       all possible *loc_m* in the right part of the sentence.
  43     - outer(*s, r*, mom.LHS, *loc_m*).
  44     - adjacent iff *t+1* == *loc_m*
  45   * and mother is a RIGHT attachment:
  46     - *loc_m* = *loc_N*.
  47     - In the RIGHT, attached, branch we find inner(*t+1, r*, mom.R, *loc_R*) for
  48       all possible *loc_R* in the right part of the sentence.
  49     - outer(*s, r*, mom.LHS, *loc_N*).
  50     - adjacent iff *t* == *loc_m*
  51
  52 + Node is on the RIGHT branch (mom.R == Node)
  53   * and mother is a LEFT attachment:
  54     - *loc_m* = *loc_N*.
  55     - In the LEFT, attached, branch we find inner(*r, s-1*, mom.L, *loc_L*) for
  56       all possible *loc_L* in the left part of the sentence.
  57     - outer(*r, t*, mom.LHS, *loc_m*).
  58     - adjacent iff *s* == *loc_m*
  59   * and mother is a RIGHT attachment:
  60     - *loc_N* will be in the RIGHT branch, can be anything here.
  61     - In the RIGHT, attached, branch we find inner(*r, s-1*, mom.L, *loc_m*) for
  62       all possible *loc_m* in the right part of the sentence.
  63     - outer(*r, t*, mom.LHS, *loc_N*).
  64     - adjacent iff *s-1* == *loc_m*
  65
  66 [[file:outer_attachments.jpg]]
  67
  68 Also, unlike in [[http://bibsonomy.org/bibtex/2b9f6798bb092697da7042ca3f5dee795][Lari & Young]], non-ROOT ('S') symbols may cover the
  69 whole sentence, but ROOT may /only/ appear if it covers the whole
  70 sentence.
  71 ** COMMENT qtrees
  72
  73 \Tree [.{$h\urcorner$} [.{$_s \ulcorner a\urcorner _t$\\
  74   Node} ] {$_{t+1} h\urcorner _r$\\
  75   R} ]
  76 \Tree [.{$h$} {$_{s} h _{t}$\\
  77   Node} [.{$_{t+1} \ulcorner a\urcorner _r$\\
  78   R} ] ]
  79 \Tree [.{$h\urcorner$} [.{$_r \ulcorner a\urcorner _{s-1}$\\
  80   L} ] {$_{s} h\urcorner _t$\\
  81   Node} ]
  82 \Tree [.{$h$} {$_{r} h _{s-1}$\\
  83   L} [.{$_{s} \ulcorner a\urcorner _t$\\
  84   Node} ] ]
  85 * TODO [#B] P_STOP and P_CHOOSE for IO/EM (reestimation)
  86 [[file:src/dmv.py::DMV%20probabilities][dmv-P_STOP]]
  87 Remember: The P_{STOP} formula is upside-down (left-to-right also).
  88 (In the article..not the [[http://www.eecs.berkeley.edu/~klein/papers/klein_thesis.pdf][thesis]])
  89
  90 ** P_STOP formulas for various dir and adj:
  91 Assuming this:
  92 - P_{STOP}(STOP|h,L,non_adj) = \sum_{corpus} \sum_{s<loc(h)} \sum_{t}
  93   inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s<loc(h)} \sum_{t}
  94   inner(s,t,h_, ...)
  95 - P_{STOP}(STOP|h,L,adj) = \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
  96   inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
  97   inner(s,t,h_, ...)
  98 - P_{STOP}(STOP|h,R,non_adj) = \sum_{corpus} \sum_{s} \sum_{t>loc(h)}
  99   inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t>loc(h)}
 100   inner(s,t,h_, ...)
 101 - P_{STOP}(STOP|h,R,adj) = \sum_{corpus} \sum_{s} \sum_{t=loc(h)}
 102   inner(s,t,_h_, ...) / \sum_{corpus} \sum_{s} \sum_{t=loc(h)}
 103   inner(s,t,h_, ...)
 104
 105 (And P_{STOP}(-STOP|...) = 1 - P_{STOP}(STOP|...) )
 106 ** P_CHOOSE formula:
 107 Assuming this:
 108 - P_{CHOOSE}(a|h,L) = \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)} \sum_{r<t}
 109   inner(s,r,_a_, ...) / \sum_{corpus} \sum_{s<loc(h)} \sum_{t>=loc(h)}
 110   inner(s,t,h_,...)
 111   - t >= loc(h) since there are many possibilites for
 112     right-attachments below, and each of them alone gives a lower
 113     probability (through multiplication) to the upper tree (so sum
 114     them all)
 115 - P_{CHOOSE}(a|h,R) = \sum_{corpus} \sum_{s=loc(h)} \sum_{r>s}
 116   inner(s,r,_a_,...) / \sum_{corpus} \sum_{s=loc(h)} \sum_{t}
 117   inner(s,t,h, ...)
 118 ** DONE [#A] How do we only count from completed trees?
 119    CLOSED: [2008-06-13 Fri 11:40]
 120 Use c(s,t,Node); inner * outer / P_sent
 121
 122 ** DONE [#A] c(s,t,Node)
 123   CLOSED: [2008-06-13 Fri 11:38]
 124 = inner * outer / P_sent
 125
 126 implemented as inner * outer / inner_sent
 127
 128 * TOGROK Find Most Probable Parse of given test sentence
 129 We could probably do it pretty easily from the chart:
 130 - for all loc_h, select the (0,len(sent),ROOT,loc_h) that has the highest p
 131 - for stops, just keep going
 132 - for rewrites, for loc_dep select the one that has the highest p
 133 * TOGROK Combine CCM with DMV
 134 * TODO Get a dependency parsed version of WSJ to test with
 135 * Initialization
 136 [[file:~/Documents/Skole/V08/Probability/dmvccm/src/dmv.py::Initialization%20todo][dmv-inits]]
 137
 138 We go through the corpus, since the probabilities are based on how far
 139 away in the sentence arguments are from their heads.
 140 ** TOGROK CCM Initialization
 141 P_{SPLIT} used here... how, again?
 142 ** DONE Separate initialization to another file?                      :PRETTIER:
 143    CLOSED: [2008-06-08 Sun 12:51]
 144 [[file:src/harmonic.py::harmonic%20py%20initialization%20for%20dmv][harmonic.py]]
 145 ** DONE DMV Initialization probabilities
 146 (from initialization frequency)
 147 ** DONE DMV Initialization frequencies
 148    CLOSED: [2008-05-27 Tue 20:04]
 149 *** P_STOP
 150 P_{STOP} is not well defined by K&M. One possible interpretation given
 151 the sentence [det nn vb nn] is
 152 : f_{STOP}( STOP|det, L, adj) +1
 153 : f_{STOP}(-STOP|det, L, adj) +0
 154 : f_{STOP}( STOP|det, L, non_adj) +1
 155 : f_{STOP}(-STOP|det, L, non_adj) +0
 156 : f_{STOP}( STOP|det, R, adj) +0
 157 : f_{STOP}(-STOP|det, R, adj) +1
 158 :
 159 : f_{STOP}( STOP|nn, L, adj) +0
 160 : f_{STOP}(-STOP|nn, L, adj) +1
 161 : f_{STOP}( STOP|nn, L, non_adj) +1  # since there's at least one to the left
 162 : f_{STOP}(-STOP|nn, L, non_adj) +0
 163 **** TODO tweak
 164 # <<pstoptweak>>
 165 :            f[head,  'STOP', 'LN'] += (i_h <= 1)     # first two words
 166 :            f[head, '-STOP', 'LN'] += (not i_h <= 1)
 167 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # very first word
 168 :            f[head, '-STOP', 'LA'] += (not i_h == 0)
 169 :            f[head,  'STOP', 'RN'] += (i_h >= n - 2) # last two words
 170 :            f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
 171 :            f[head,  'STOP', 'RA'] += (i_h == n - 1) # very last word
 172 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1)
 173 vs
 174 :            # this one requires some additional rewriting since it
 175 :            # introduces divisions by zero
 176 :            f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
 177 :            f[head, '-STOP', 'LN'] += (not i_h <= 1) # not first two
 178 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
 179 :            f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
 180 :            f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
 181 :            f[head, '-STOP', 'RN'] += (not i_h >= n - 2) # not last two
 182 :            f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
 183 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
 184 vs
 185 :            f[head,  'STOP', 'LN'] += (i_h == 1)     # second word
 186 :            f[head, '-STOP', 'LN'] += (not i_h == 1) # not second
 187 :            f[head,  'STOP', 'LA'] += (i_h == 0)     # first word
 188 :            f[head, '-STOP', 'LA'] += (not i_h == 0) # not first
 189 :            f[head,  'STOP', 'RN'] += (i_h == n - 2)     # second-to-last
 190 :            f[head, '-STOP', 'RN'] += (not i_h == n - 2) # not second-to-last
 191 :            f[head,  'STOP', 'RA'] += (i_h == n - 1)     # last word
 192 :            f[head, '-STOP', 'RA'] += (not i_h == n - 1) # not last
 193 vs
 194 "all words take the same number of arguments" interpreted as
 195 :for all heads:
 196 :    p_STOP(head, 'STOP', 'LN') = 0.3
 197 :    p_STOP(head, 'STOP', 'LA') = 0.5
 198 :    p_STOP(head, 'STOP', 'RN') = 0.4
 199 :    p_STOP(head, 'STOP', 'RA') = 0.7
 200 (which we easily may tweak in init_zeros())
 201 *** P_CHOOSE
 202 Go through the corpus, counting distances between heads and
 203 arguments. In [det nn vb nn], we give
 204 - f_{CHOOSE}(nn|det, R) +1/1 + C
 205 - f_{CHOOSE}(vb|det, R) +1/2 + C
 206 - f_{CHOOSE}(nn|det, R) +1/3 + C
 207   - If this were the full corpus, P_{CHOOSE}(nn|det, R) would have
 208     (1+1/3+2C) / sum_a f_{CHOOSE}(a|det, R)
 209
 210 The ROOT gets "each argument with equal probability", so in a sentence
 211 of three words, 1/3 for each (in [nn vb nn], 'nn' gets 2/3). Basically
 212 just a frequency count of the corpus...
 213
 214 In a sense there are no terminal probabilities, since an /h/ can only
 215 rewrite to an 'h' anyway.
 216 * [#C] Deferred
 217 http://wiki.python.org/moin/PythonSpeed/PerformanceTips Eg., use
 218 map/reduce/filter/[i for i in [i's]]/(i for i in [i's]) instead of
 219 for-loops; use local variables for globals (global variables or or
 220 functions), etc.
 221
 222 ** TODO when reestimating P_STOP etc, remove rules with p < epsilon   :OPTIMIZE:
 223 ** TODO inner_dmv, short ranges and impossible attachment             :OPTIMIZE:
 224 If s-t <= 2, there can be only one attachment below, so don't recurse
 225 with both Lattach=True and Rattach=True.
 226
 227 If s-t <= 1, there can be no attachment below, so only recurse with
 228 Lattach=False, Rattach=False.
 229
 230 Put this in the loop under rewrite rules (could also do it in the STOP
 231 section, but that would only have an effect on very short sentences).
 232 ** TODO clean up the module files                                     :PRETTIER:
 233 Is there better way to divide dmv and harmonic? There's a two-way
 234 dependency between the modules. Guess there could be a third file that
 235 imports both the initialization and the actual EM stuff, while a file
 236 containing constants and classes could be imported by all others:
 237 : dmv.py imports dmv_EM.py imports dmv_classes.py
 238 : dmv.py imports dmv_inits.py imports dmv_classes.py
 239
 240 ** TOGROK Some (tagged) sentences are bound to come twice             :OPTIMIZE:
 241 Eg, first sort and count, so that the corpus
 242 [['nn','vbd','det','nn'],
 243  ['vbd','nn','det','nn'],
 244  ['nn','vbd','det','nn']]
 245 becomes
 246 [(['nn','vbd','det','nn'],2),
 247  (['vbd','nn','det','nn'],1)]
 248 and then in each loop through sentences, make sure we handle the
 249 frequency correctly.
 250
 251 Is there much to gain here?
 252
 253 ** TOGROK tags as numbers or tags as strings?                         :OPTIMIZE:
 254 Need to clean up the representation.
 255
 256 Stick with tag-strings in initialization then switch to numbers for
 257 IO-algorithm perhaps? Can probably afford more string-matching in
 258 initialization..
 259 ** DONE if loc_h == t, no need to try right-attachment rules &v.v.    :OPTIMIZE:
 260    CLOSED: [2008-06-10 Tue 14:34]
 261 (and if loc_h == s, no need to try left-attachment rules.)
 262
 263 Modest speed increase (5%).
 264 ** DONE io.debug parameters should not call functions                 :OPTIMIZE:
 265    CLOSED: [2008-06-10 Tue 12:26]
 266 Exchanged all io.debug(str,'level') calls with statements of the form:
 267 :if 'level' in io.DEBUG:
 268 :    print str
 269
 270 and got an almost threefold speed increase on inner().
 271 ** DONE inner_dmv() should disregard rules with heads not in sent     :OPTIMIZE:
 272    CLOSED: [2008-06-08 Sun 10:18]
 273 If the sentence is "nn vbd det nn", we should not even look at rules
 274 where
 275 : rule.head() not in "nn vbd det nn".split()
 276 This is ruled out by getting rules from g.rules(LHS, sent).
 277
 278 Also, we optimize this further by saying we don't even recurse into
 279 attachment rules where
 280 : rule.head() not in sent[ s :r+1]
 281 : rule.head() not in sent[r+1:t+1]
 282 meaning, if we're looking at the span "vbd det", we only use
 283 attachment rules where both daughters are members of ['vbd','det']
 284 (although we don't (yet) care about removing rules that rewrite to the
 285 same tag if there are no duplicate tags in the span, etc., that would
 286 be a lot of trouble for little potential gain).
 287 * Adjacency and combining it with the inside-outside algorithm
 288 Each DMV_Rule has both a probN and a probA, for adjacencies. inner()
 289 and outer() needs the correct one in each case.
 290
 291 In each inner() call, loc_h is the location of the head of this
 292 dependency structure. In each outer() call, it's the head of the /Node/,
 293 the structure we're looking outside of.
 294
 295 We call inner() for each location of a head, and on each terminal,
 296 loc_h must equal s (and t). In the recursive attachment calls, we use
 297 the locations (sentence indices) of words to the left or right of the
 298 head in calls to inner(). /loc_h lets us check whether we need probN or
 299 probA/.
 300 ** Possible alternate type of adjacency
 301 K&M's adjacency is just whether or not an argument has been generated
 302 in the current direction yet. One could also make a stronger type of
 303 adjacency, where h and a are not adjacent if b is in between, eg. with
 304 the sentence "a b h" and the structure ((h->a), (a->b)), h is
 305 K&M-adjacent to a, but not next to a, since b is in between. It's easy
 306 to check this type of adjacency in inner(), but it needs new rules for
 307 P_STOP reestimation.
 308 * Expectation Maximation in IO/DMV-terms
 309 outer(s,t,Node) and inner(s,t,Node) calculates the expected number of
 310 trees (CNF-)headed by Node from *s* to *t* (sentence positions). This uses
 311 the P_STOP and P_CHOOSE values, which have been conveniently
 312 distributed into CNF rules as probN and probA (non-adjacent and
 313 adjacent probabilites).
 314
 315 When re-estimating, we use the expected values from outer() and
 316 inner() to get new values for P_STOP and P_CHOOSE. When we've
 317 re-estimated for the entire corpus, we distribute P_STOP and P_CHOOSE
 318 into the CNF rules again, so that in the next round we use new probN
 319 and probA to find outer- and inner-probabilites.
 320
 321 The distribution of P_STOP and P_CHOOSE into CNF rules also happens in
 322 init_normalize() (here along with the creation of P_STOP and
 323 P_CHOOSE); P_STOP is used to create CNF rules where one branch of the
 324 rule is STOP, P_CHOOSE is used to create rules of the form
 325 : h  -> h  _a_
 326 : h_ -> h_ _a_
 327
 328 Since "adjacency" is not captured in regular CNF rules, we need two
 329 probabilites for each rule, and outer() and inner() have to know when
 330 to use which.
 331
 332
 333 * Python-stuff
 334 # <<python>>
 335 Make those debug statements steal a bit less attention in emacs:
 336 :(font-lock-add-keywords
 337 : 'python-mode                   ; not really regexp, a bit slow
 338 : '(("^\\( *\\)\\(\\if +'.+' +in +io.DEBUG. *\\(
 339 :\\1    .+$\\)+\\)" 2 font-lock-preprocessor-face t)))
 340 :(font-lock-add-keywords
 341 : 'python-mode
 342 : '(("\\<\\(\\(io\\.\\)?debug(.+)\\)" 1 font-lock-preprocessor-face t)))
 343
 344 - [[file:src/pseudo.py][pseudo.py]]
 345 - http://nltk.org/doc/en/structured-programming.html recursive dynamic
 346 - http://nltk.org/doc/en/advanced-parsing.html
 347 - http://jaynes.colorado.edu/PythonIdioms.html
 348
 349
 350
 351 * Git
 352 Repository web page: http://repo.or.cz/w/dmvccm.git
 353
 354 Setting up a new project:
 355 : git init
 356 : git add .
 357 : git commit -m "first release"
 358
 359 Later on: (=-a= does =git rm= and =git add= automatically)
 360 : git init
 361 : git commit -a -m "some subsequent release"
 362
 363 Then push stuff up to the remote server:
 364 : git push git+ssh://username@repo.or.cz/srv/git/dmvccm.git master
 365
 366 (=eval `ssh-agent`= and =ssh-add= to avoid having to type in keyphrase all
 367 the time)
 368
 369 Make a copy of the (remote) master branch:
 370 : git clone git://repo.or.cz/dmvccm.git
 371
 372 Make and name a new branch in this folder
 373 : git checkout -b mybranch
 374
 375 To save changes in =mybranch=:
 376 : git commit -a
 377
 378 Go back to the master branch (uncommitted changes from =mybranch= are
 379 carried over):
 380 : git checkout master
 381
 382 Try out:
 383 : git add --interactive
 384
 385 Good tutorial:
 386 http://www-cs-students.stanford.edu/~blynn//gitmagic/