src/io.py.~12~

   1 ##################################################################
   2 #                            Changes:                            #
   3 ##################################################################
   4 # 2008-05-24, KBU:
   5 # - more elaborate pseudo-code in the io()-function
   6 #
   7 # 2008-05-25, KBU:
   8 # - CNF_Rule has a function LHS() which returns head, in DMV_Rule this
   9 #   returns the pair of (bars, head).
  10 #
  11 # 2008-05-27
  12 # - CNF_Rule has __eq__ and __ne__ defined, so that we can use == and
  13 #   != on two such rules
  14
  15
  16 # import numpy # numpy provides Fast Arrays, for future optimization
  17 # import pprint # for pretty-printing
  18
  19 DEBUG = 0
  20
  21 # some of dmv-module bleeding in here... todo: prettier (in inner())
  22 NOBAR = 0
  23 STOP = (NOBAR, -2)
  24
  25 def debug(string):
  26     '''Easily turn on/off inline debug printouts with this global. There's
  27 a lot of cluttering debug statements here, todo: clean up'''
  28     if DEBUG:
  29         print string
  30
  31 class CNF_Rule():
  32     '''A single CNF rule in the PCFG, of the form
  33     head -> L R
  34     where these are just integers
  35     (where do we save the connection between number and symbol?
  36     symbols being 'vbd' etc.)'''
  37     def __eq__(self, other):
  38         return self.LHS() == other.LHS() and self.R == other.R and self.L == other.L
  39     def __ne__(self, other):
  40         return self.LHS() != other.LHS() or self.R != other.R or self.L != other.L
  41     def __str__(self):
  42         return "%s -> %s %s [%.2f]" % (self.head, self.L, self.R, self.prob)
  43     def __init__(self, head, L, R, prob):
  44         self.head = head
  45         self.R = R
  46         self.L = L
  47         self.prob = prob
  48     def p(self, *arg):
  49         "Return a probability, doesn't care about attachment..."
  50         return self.prob
  51     def LHS(self):
  52         return self.head
  53
  54 class Grammar():
  55     '''The PCFG used in the I/O-algorithm.
  56
  57     Todo: as of now, this allows duplicate rules... should we check
  58     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  59     inner probabilities of 2.)'''
  60     def rules(self, head):
  61         return [rule for rule in self.p_rules if rule.head == head]
  62
  63     def __init__(self, p_rules, p_terminals):
  64         '''rules and p_terminals should be arrays, where p_terminals are of
  65         the form [preterminal, terminal], and rules are CNF_Rule's.'''
  66         self.p_rules = p_rules # could check for summing to 1 (+/- epsilon)
  67         self.p_terminals = p_terminals
  68
  69
  70
  71 def inner(s, t, LHS, g, sent, chart):
  72     ''' Give the inner probability of having the node LHS cover whatever's
  73     between s and t in sentence sent, using grammar g.
  74
  75     Returns a pair of the inner probability and the chart
  76
  77     For DMV, LHS is a pair (bar, h), but this function ought to be
  78     agnostic about that.
  79
  80     e() is an internal function, so the variable chart (a dictionary)
  81     is available to all calls of e().
  82
  83     Since terminal probabilities are just simple lookups, they are not
  84     put in the chart (although we could put them in there later to
  85     optimize)
  86     '''
  87
  88     def O(s):
  89         return sent[s]
  90
  91     def e(s,t,LHS):
  92         '''Chart has lists of probability and whether or not we've attached
  93 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
  94 Rattach==True then the rule has a right-attachment or there is one
  95 lower in the tree (meaning we're no longer adjacent).'''
  96         if (s, t, LHS) in chart:
  97             return chart[(s, t, LHS)]
  98         else:
  99             debug( "trying from %d to %d with %s" % (s,t,LHS) )
 100             if s == t:
 101                 if (LHS, O(s)) in g.p_terminals:
 102                     prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)]
 103                 else:
 104                     prob = 0.0 # todo: is this the right way to deal with lacking rules?
 105                     print "\t LACKING TERMINAL:"
 106                 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(s), prob) )
 107                 # terminals have no attachment
 108                 return prob
 109             else:
 110                 if (s,t,LHS) not in chart:
 111                     # by default, not attachment yet
 112                     chart[(s,t,LHS)] = 0.0 #, False, False]
 113                 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
 114                     debug( "\tsumming rule %s" % rule )
 115                     L = rule.L
 116                     R = rule.R
 117                     for r in range(s, t): # summing over r = s to r = t-1
 118                         p_L = e(s, r, L)
 119                         p_R = e(r + 1, t, R)
 120                         p = rule.p("todo","todo")
 121                         chart[(s, t, LHS)] += p * p_L * p_R
 122                 debug( "\tchart[(%d,%d,%s)] = %.2f" % (s,t,LHS, chart[(s,t,LHS)]) )
 123                 return chart[(s, t, LHS)]
 124     # end of e-function
 125
 126     inner_prob = e(s,t,LHS)
 127     if DEBUG:
 128         print "---CHART:---"
 129         for k,v in chart.iteritems():
 130             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 131         print "---CHART:end---"
 132     return [inner_prob, chart] # inner_prob == chart[(s,t,LHS)]
 133
 134
 135
 136
 137
 138
 139
 140
 141 if __name__ == "__main__":
 142     print "IO-module tests:"
 143     b = {}
 144     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 145     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 146     b[1, 'n'] = 0.7 # np->'n'
 147     b[3, 'n'] = 1.0 # n->'n'
 148     b[4, 'p'] = 1.0 # p->'p'
 149     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 150     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 151     b[5, 'v'] = 1.0 # v->'v'
 152
 153     g = Grammar([s,np,vp,vp2], b)
 154
 155     print "The rules:"
 156     for i in range(0,5):
 157         for r in g.rules(i):
 158             print r
 159     print ""
 160
 161     test1 = inner(0,0, 1, g, ['n'], {})
 162     if test1[0] != 0.7:
 163         print "should be 0.70 : %.2f" % test1[0]
 164         print ""
 165
 166     DEBUG = 1
 167     test2 = inner(0,2, 2, g, ['v','n','p'], test1[1])
 168     print "should be 0.?? (.09??) : %.2f" % test2[0]
 169     print "------ trying the same again:----------"
 170     test2 = inner(0,2, 2, g, ['v','n','p'], test2[1])
 171     print "should be 0.?? (.09??) : %.2f" % test2[0]
 172
 173
 174 ##################################################################
 175 #            just junk from here on down:                        #
 176 ##################################################################
 177
 178 # def io(corpus):
 179 #     "(pseudo-code / wishful thinking) "
 180 #     g = initialize(corpus) # or corpus.tagset ?
 181
 182 #     P = {('v','n','p'):0.09}
 183 #     # P is used in v_q, w_q (expectation), so each sentence in the
 184 #     # corpus needs some initial P.
 185
 186 #     # --- Maximation: ---
 187 #     #
 188 #     # actually, this step (from Lari & Young) probably never happens
 189 #     # with DMV, since instead of the a[i,j,k] and b[i,m] vectors, we
 190 #     # have P_STOP and P_CHOOSE... or, in a sense it happens only we
 191 #     # calculate P_STOP and P_CHOOSE..somehow.
 192 #     for rule in g.p_rules:
 193 #         rule.num = 0
 194 #         rule.den = 0
 195 #     for pre_term in range(len(g.p_terminals)):
 196 #         ptnum[pre_term] = 0
 197 #         ptden[pre_term] = 0
 198
 199 #     # we could also flip this to make rules the outer loop, then we
 200 #     # wouldn't have to initialize den/num in loops of their own
 201 #     for sent in corpus:
 202 #         for rule in g.p_rules # Equation 20
 203 #             for s in range(len(sent)):
 204 #                 for t in range(s, len(sent)):
 205 #                     rule.num += w(s,t, rule.LHS(),rule.L,rule.R, g, sent, P[sent])
 206 #                     rule.den += v(s,t, rule.LHS(), g, sent, P[sent])
 207 #             # todo: do we need a "new-prob" vs "old-prob" distinction here?
 208 #                     probably, since we use inner/outer which checks rule.prob()
 209 #             # todo: also, this wouldn't work, since for each sentence, we'd
 210 #             #       discard the old probability; should rules be the outer
 211 #             #       loop then?
 212 #             rule.prob = rule.num / rule.den
 213 #         for pre_term in range(len(g.p_terminals)): # Equation 21
 214 #             num = 0
 215 #             den = 0
 216 #             for s in range(len(sent)):
 217 #                 for t in range(s, len(sent)):
 218 #                     num += v(t,t,pre_term, g, sent, P[sent])
 219 #                     den += v(s,t,pre_term, g, sent, P[sent])
 220
 221 #     for rule in g.rules:
 222 #         rule.prob = rule.num / rule.den
 223 #     for pre_term in range(len(g.p_terminals)):
 224 #         g.p_terminals[pre_term] = ptnum[pre_term] / ptden[pre_term]
 225
 226
 227 #     # --- Expectation: ---
 228 #     for sent in corpus: # Equation 11
 229 #         inside = inner(0, len(sent), ROOT, g, sent)
 230 #         P[sent] = inside[0]
 231
 232 #     # todo: set inner.chart to {} again, how?
 233
 234 #     # todo: need a old-P new-P distinction to check if we're below
 235 #     # threshold difference
 236 #     return "todo"
 237
 238 # def w(s,t, LHS,L,R, g, sent, P_sent):
 239 #     w = 0
 240 #     rule = g.rule(LHS, L, R)
 241 #     for r in range(s, t):
 242 #         w += rule.prob() * inner(s,r, L, g, sent) * inner(r+1, t, R, g, sent) * outer(s,t,LHS,g,sent)
 243 #     return w / P_sent
 244
 245 # def v(s,t, LHS, g, sent, P_sent):
 246 #     return ( inner(s,t, LHS, g, sent) * outer(s,t, LHS, g, sent) ) / P_sent
 247
 248
 249
 250
 251