src/io.py

   1 # io.py, with sentence positions between word locations (i<k<j)
   2
   3 DEBUG = set(['TODO'])
   4
   5 # some of dmv-module bleeding in here... todo: prettier (in inner())
   6 NOBAR = 0
   7 STOP = (NOBAR, -2)
   8 ROOTNUM = -1
   9
  10 def debug(string, level='TODO'):
  11     '''Easily turn on/off inline debug printouts with this global. There's
  12 a lot of cluttering debug statements here, todo: clean up'''
  13     if level in DEBUG:
  14         print string
  15
  16
  17 class Grammar():
  18     '''The PCFG used in the I/O-algorithm.
  19
  20     Public members:
  21     p_terminals
  22
  23     Todo: as of now, this allows duplicate rules... should we check
  24     for this?  (eg. g = Grammar([x,x],[]) where x.prob == 1 may give
  25     inner probabilities of 2.)'''
  26     def all_rules(self):
  27         return self.__p_rules
  28
  29     def rules(self, LHS):
  30         return [rule for rule in self.all_rules() if rule.LHS() == LHS]
  31
  32 #     def nums(self):
  33 #         return self.__numtag
  34
  35     def headnums(self):
  36         return self.__head_nums
  37
  38     def sent_nums(self, sent):
  39         return [self.tagnum(tag) for tag in sent]
  40
  41     def numtag(self, num):
  42         if num == ROOTNUM: # don't want these added to headnums (which we iter through)
  43             return 'ROOT'
  44         else:
  45             return self.__numtag[num]
  46
  47     def tagnum(self, tag):
  48         if tag == 'ROOT':
  49             return ROOTNUM
  50         else:
  51             return self.__tagnum[tag]
  52
  53     def __init__(self, numtag, tagnum, p_rules=[], p_terminals=[]):
  54         '''rules and p_terminals should be arrays, where p_terminals are of
  55         the form [preterminal, terminal], and rules are CNF_Rule's.'''
  56         self.__numtag = numtag
  57         self.__tagnum = tagnum
  58         self.__head_nums = [k for k in numtag.iterkeys()]
  59         self.__p_rules = p_rules # todo: could check for summing to 1 (+/- epsilon)
  60         self.p_terminals = p_terminals
  61
  62
  63
  64
  65 class CNF_Rule():
  66     '''A single CNF rule in the PCFG, of the form
  67     LHS -> L R
  68     where these are just integers
  69     (where do we save the connection between number and symbol?
  70     symbols being 'vbd' etc.)'''
  71     def __eq__(self, other):
  72         return self.LHS() == other.LHS() and self.R() == other.R() and self.L() == other.L()
  73     def __ne__(self, other):
  74         return self.LHS() != other.LHS() or self.R() != other.R() or self.L() != other.L()
  75     def __str__(self):
  76         return "%s -> %s %s [%.2f]" % (self.LHS(), self.L(), self.R(), self.prob)
  77     def __init__(self, LHS, L, R, prob):
  78         self.__LHS = LHS
  79         self.__R = R
  80         self.__L = L
  81         self.prob = prob
  82     def p(self, *arg):
  83         "Return a probability, doesn't care about attachment..."
  84         return self.prob
  85     def LHS(self):
  86         return self.__LHS
  87     def L(self):
  88         return self.__L
  89     def R(self):
  90         return self.__R
  91
  92 def inner(i, j, LHS, g, sent, chart):
  93     ''' Give the inner probability of having the node LHS cover whatever's
  94     between s and t in sentence sent, using grammar g.
  95
  96     Returns a pair of the inner probability and the chart
  97
  98     For DMV, LHS is a pair (bar, h), but this function ought to be
  99     agnostic about that.
 100
 101     e() is an internal function, so the variable chart (a dictionary)
 102     is available to all calls of e().
 103
 104     Since terminal probabilities are just simple lookups, they are not
 105     put in the chart (although we could put them in there later to
 106     optimize)
 107     '''
 108
 109     def O(i,j):
 110         return sent[i]
 111
 112     def e(i,j,LHS):
 113         '''Chart has lists of probability and whether or not we've attached
 114 yet to L and R, each entry is a list [p, Rattach, Lattach], where if
 115 Rattach==True then the rule has a right-attachment or there is one
 116 lower in the tree (meaning we're no longer adjacent).'''
 117         if (i, j, LHS) in chart:
 118             return chart[i, j, LHS]
 119         else:
 120             debug( "trying from %d to %d with %s" % (i,j,LHS) , "IO")
 121             if i+1 == j:
 122                 if (LHS, O(i,j)) in g.p_terminals:
 123                     prob = g.p_terminals[LHS, O(i,j)] # b[LHS, O(s)] in L&Y
 124                 else:
 125                     prob = 0.0
 126                     print "\t LACKING TERMINAL:%s -> %s : %.1f" % (LHS, O(i,j), prob)
 127                 debug( "\t terminal: %s -> %s : %.1f" % (LHS, O(i,j), prob) ,"IO")
 128                 # terminals have no attachment
 129                 return prob
 130             else:
 131                 if (i,j,LHS) not in chart:
 132                     # by default, not attachment yet
 133                     chart[i,j,LHS] = 0.0
 134                 for rule in g.rules(LHS): # summing over rules headed by LHS, "a[i,j,k]"
 135                     debug( "\tsumming rule %s" % rule , "IO")
 136                     L = rule.L()
 137                     R = rule.R()
 138                     for k in range(i+1, j): # i<k<j
 139                         p_L = e(i, k, L)
 140                         p_R = e(k, j, R)
 141                         p = rule.p()
 142                         chart[i, j, LHS] += p * p_L * p_R
 143                 debug( "\tchart[%d,%d,%s] = %.2f" % (i,j,LHS, chart[i,j,LHS]) ,"IO")
 144                 return chart[i, j, LHS]
 145     # end of e-function
 146
 147     inner_prob = e(i,j,LHS)
 148     if 'IO' in DEBUG:
 149         print "---CHART:---"
 150         for k,v in chart.iteritems():
 151             print "\t%s -> %s_%d ... %s_%d : %.1f" % (k[2], O(k[0]), k[0], O(k[1]), k[1], v)
 152         print "---CHART:end---"
 153     return [inner_prob, chart]
 154
 155
 156
 157
 158
 159
 160
 161
 162 if __name__ == "__main__":
 163     print "IO-module tests:"
 164     b = {}
 165     s   = CNF_Rule(0,1,2, 1.0) # s->np vp
 166     np  = CNF_Rule(1,3,4, 0.3) # np->n p
 167     b[1, 'n'] = 0.7 # np->'n'
 168     b[3, 'n'] = 1.0 # n->'n'
 169     b[4, 'p'] = 1.0 # p->'p'
 170     vp  = CNF_Rule(2,5,1, 0.1) # vp->v np (two parses use this rule)
 171     vp2 = CNF_Rule(2,2,4, 0.9) # vp->vp p
 172     b[5, 'v'] = 1.0 # v->'v'
 173
 174     g = Grammar({0:'s',1:'np',2:'vp',3:'n',4:'p',5:'v'},
 175                 {'s':0,'np':1,'vp':2,'n':3,'p':4,'v':5},
 176                 [s,np,vp,vp2], b)
 177
 178 #     print "The rules:"
 179 #     for i in range(0,5):
 180 #         for r in g.rules(i):
 181 #             print r
 182 #     print ""
 183
 184     test1 = inner(0,1, 1, g, ['n'], {})
 185     if test1[0] != 0.7:
 186         print "should be 0.70 : %.3f" % test1[0]
 187         print ""
 188
 189     test2 = inner(0,3, 2, g, ['v','n','p'], test1[1])
 190     if test2[0] != 0.0930:
 191         print "should be 0.0930 : %.4f" % test2[0]
 192     test2 = inner(0,3, 2, g, ['v','n','p'], test2[1])
 193     if test2[0] != 0.0930:
 194         print "should be 0.0930 : %.4f" % test2[0]