src/dmv.py~20080511~

   1 # import numpy # numpy provides Fast Arrays, for future optimization
   2 import io
   3
   4 # the following will only print when in this module:
   5 if __name__ == "__main__":
   6     print "DMV-module tests:"
   7     a = io.CNF_Rule(3,1,3,0.1)
   8     if(a.head == 3):
   9         print "import io works"
  10
  11 class DMV_Grammar(io.Grammar):
  12     '''The DMV-PCFG.
  13
  14     We need to be able to access rules per mother node, sum every H, every
  15     H_, ..., every H', every H'_, etc. for the IO-algorithm.
  16
  17     What other representations do we need? (P_STOP formula uses
  18     deps_D(h,l/r) at least)'''
  19     def rules(self, (bars, head)):
  20         return [r for r in self.p_rules if r.head == head and r.bars == bars]
  21
  22     def headed_rule_probs(self):
  23         '''Give all rules sorted by head, for use in DMV-specific
  24         probabilities (need deps_D(h, l/r) and so on also)'''
  25         return "todo"
  26
  27     def __init__(self, p_rules, p_terminals):
  28         io.Grammar.__init__(self, p_rules, p_terminals)
  29
  30
  31 class DMV_Rule(io.CNF_Rule):
  32     '''A single CNF rule in the PCFG, of the form
  33     head -> l r
  34     with the added information of bars
  35     '''
  36     def __str__(self):
  37         b = ""
  38         if(self.bars == 1):
  39             b = "_"
  40         if(self.bars == 2): # todo: print prettier?
  41             b = "__"
  42         return b + io.CNF_Rule.__str__(self)
  43
  44     def __init__(self, head, L, R, prob, bars):
  45         io.CNF_Rule.__init__(self, head, L, R, prob)
  46         if bars in [0,1,2]:
  47             self.bars = bars
  48         else:
  49             raise ValueError("bars must be either 0, 1 or 2; was given: %s" % bars)
  50
  51
  52 # the following will only print when in this module:
  53 if __name__ == "__main__":
  54     # these are not Real rules, just testing the classes. todo: make
  55     # a rule-set to test inner() on.
  56     a1 = DMV_Rule(3,1,3, 0.1, 1)
  57     a2 = DMV_Rule(1,3,3, 0.4, 2)
  58     a3 = DMV_Rule(1,1,3, 0.2, 2)
  59     a4 = DMV_Rule(1,1,3, 0.2, 0)
  60     print a1
  61     print a4
  62
  63     b = {}
  64     b[3, 'vbd'] = 0.1
  65     g = DMV_Grammar([a1,a2,a3,a4], b)
  66
  67     for r in g.rules((2,1)):
  68         print r
  69     print "should be 0.1:%.1f" % io.inner(0,0, 3, g, ['vbd'])
  70
  71
  72 def all_inner_dmv(sent, g):
  73     for h in g.heads():
  74         for bars in [0, 1, 2]:
  75             for s in range(s, len(sent)): # summing over r = s to r = t-1
  76                 for t in range(s+1, len(sent)):
  77                     io.inner(s, t, (bars, h))
  78
  79
  80
  81 # DMV-probabilities:
  82
  83 # def DMV(sentence):
  84 #     '''Here it seems like they store bracketing information on a per-head
  85 #     (per direction) basis, in deps_D(h, dir) which gives us a list. '''
  86 #     P_D = array indexed on h`s from sentence
  87 #     for dir from {l,r}:
  88 #         for a from deps_D(h, dir):
  89 #             P_D[h] *= \
  90 #                 P_STOP (notSTOP | h, dir, adj) * \
  91 #                 P_CHOOSE (a | h,dir) * \
  92 #                 P (D(a)) * \
  93 #                 P_STOP (STOP | h, dir, adj)
  94 #             return P_D_h
  95
  96 # def P_STOP(STOP | h, dir, adj):
  97 #     P_STOP_num = 0
  98 #     for s from S:
  99 #         for i<loc(h): # where h is in the sentence
 100 #             for k:
 101 #                 P_STOP_num += c(h-r : i, k)
 102 #     P_STOP_den = 0
 103 #     for s from S:
 104 #         for i<loc(h):
 105 #             for k:
 106 #                 P_STOP_den += c(l-h-r : i, k)
 107 #     return P_STOP_num / P_STOP_den
 108
 109
 110
 111 ##################################################################
 112 #            just junk from here on down:                        #
 113 ##################################################################
 114
 115 def initialize(taglist):
 116     '''If H and A are non-equal tags, we want every rule
 117     of these forms:
 118     _H_ -> STOP H_
 119     H_ -> H STOP
 120     H_ -> _A_ H
 121     H  -> H _A_
 122
 123     to have some initial probability. It's supposed to be harmonic,
 124     which we get from an M-step of the IO-algorithm; do we need a full
 125     rule count? -- Where we check the distance from H to A in each case,
 126     and the probability is C/distance + C' ... but then what constants
 127     do we need in order to sum to 1?
 128
 129     p.4 in Carroll & Charniak gives the number of DG rules for
 130     sentences of length n as n*(2^{n-1}+1).
 131
 132         "harmonic" completion where all non-ROOT words took the same
 133         number of arguments, and each took other words as arguments in
 134         inverse proportion to (a constant plus) the distance between
 135         them. The ROOT always had a single argument and took each word
 136         with equal probability. (p.5 in DMV-article)
 137
 138     ---
 139
 140     Given their model, we have these types of probabilities:
 141
 142     - P_STOP(-STOP | h, dir, adj)
 143     - P_STOP(STOP | h, dir, adj)
 144     - P_CHOOSE(a | h, dir)
 145
 146     and then each has to sum to 1 given certain values for h, dir, adj.
 147     (so P_STOP(-STOP | VBD, L, non-adj)+P_STOP( STOP | VBD, L, non-adj)=1;
 148     and summing P_CHOOSE(a | VBD, L), over all a, equals 1).
 149
 150     '''
 151     rules = [ ] # should be a simple array where indexes correspond to
 152                 # heads h, so if h->_a_ h_, then h == heads[i] iff
 153                 # rules[i] == (_a_, h_)
 154     heads = ['ROOT']
 155     for h in taglist:
 156
 157         rules['ROOT'] = ('ROOT', h)
 158         rules['_' + h + '_'] = ('STOP', h + '_') # _H_ -> STOP H_
 159         rules[      h + '_'] = ('h'   , 'STOP' ) # H_ -> H STOP
 160         for a in taglist:
 161             if a != h :
 162                 rules[h] = (h      , '_' + a + '_') # H  ->  H _A_
 163                 rules[h + '_'] = ('_' + a + '_', h + '_') # H_ -> _A_ H_
 164
 165     # not sure what kind of rule-access we'll need
 166     # (dictionary?).. also, rules should be stored somewhere "globally
 167     # accessible"..
 168
 169     # put the probabilities in a single array, indexed on rule-number,
 170     # (anticipating the need for quick and simple access)
 171     p_rules = numpy.zeros( (len(rules)) )
 172
 173     return (p_rules, rules)
 174
 175
 176 def bracket(tags):
 177     p = tuple([(tag, 1) for tag in tags])
 178     test = numpy.zeros((len(tags)+1, len(tags)+1))
 179     return test
 180
 181 #if __name__ == "__main__":
 182     # pprint.pprint(initialize(['vbd', 'nn', 'jj', 'dt']))
 183     # print bracket(("foo", "bar", "fie","foe"))