3 # - prettier printout for DMV_Rule
4 # - DMV_Rule changed a bit. head, L and R are now all pairs of the
6 # - Started on P_STOP, a bit less pseudo now..
9 # - started on initialization. So far, I have frequencies for
10 # everything, very harmonic. Still need to make these into 1-summing
14 # - more work on initialization (init_freq and init_normalize),
15 # getting closer to probabilities now.
18 # - init_normalize is done, it creates p_STOP, p_ROOT and p_CHOOSE,
19 # and also adds the relevant probabilities to p_rules in a grammar.
20 # Still, each individual rule has to store both adjacent and non_adj
21 # probabilities, and inner() should be able to send some parameter
22 # which lets the rule choose... hopefully... Is this possible to do
23 # top-down even? when the sentence could be all the same words?
24 # todo: extensive testing of identical words in sentences!
25 # - frequencies (only used in initialization) are stored as strings,
26 # but in the rules and p_STOP etc, there are only numbers.
29 # - copied inner() into this file, to make the very dmv-specific
30 # adjacency stuff work (have to factor that out later on, when it
34 # - finished typing in inner_dmv(), still have to test and debug
35 # it. The chart is now four times as big since for any rule we may
36 # have attachments to either the left or the right below, which
37 # upper rules depend on, for selecting probN or probA
40 # import numpy # numpy provides Fast Arrays, for future optimization
48 # non-tweakable/constant "lookup" globals
55 # (is this the best way to represent ROOT and STOP?)
57 # todo: use these instead for attachment constants. Requires making
58 # the last two indices of chart[] one single pair, boring retyping and
# NOATT: "no attachment on either side" -- a pair of the L/R attachment
# booleans used as part of chart keys (cf. chart[(s,t,LHS,Lattach,Rattach)]
# in inner_dmv below).
63 NOATT = (False, False)
65 if __name__ == "__main__":
66 print "DMV module tests:"
# NOTE(review): this listing is sampled -- several original lines are
# missing here, so the class docstring below appears without its opening
# quotes and some method bodies/def lines are truncated.
68 class DMV_Grammar(io.Grammar):
72 p_STOP, p_ROOT, p_CHOOSE, p_terminals
73 These are changed in the Maximation step, then used to set the
74 new probabilities of each DMV_Rule.
76 Todo: make p_terminals private? (But it has to be changable in
77 maximation step due to the short-cutting rules... could of course
78 make a DMV_Grammar function to update the short-cut rules...)
80 __p_rules is private, but we can still say stuff like:
81 for r in g.all_rules():
84 What other representations do we need? (P_STOP formula uses
85 deps_D(h,l/r) at least)'''
# fragment of a method that iterates all rules (its def line is not
# shown in this listing)
88 for r in self.all_rules():
# rules(LHS): return every rule whose head and bars match the given LHS
# -- presumably the body of a rules() method; def line not shown.
95 return [r for r in self.all_rules() if r.head() == head and r.bars() == bars]
98 '''Not sure yet what is needed here, or where this is needed'''
def deps_L(self, head):
    '''Return the left children (L nodes) of every rule with the given
    head.

    Bug fix: the original comprehension filtered on "a == r.L()" but
    never bound "a" anywhere, so it raised NameError (or silently used
    a stale "a" leaked from an earlier loop in Python 2) -- the
    author's own todo comment suspected as much.  Returning r.L()
    directly yields the intended list of left dependents.'''
    return [r.L() for r in self.all_rules() if r.head() == head]
def deps_R(self, head):
    '''Return the right children (R nodes) of every rule with the given
    head.

    Bug fix: as with deps_L, the original comprehension tested
    "a == r.R()" without ever binding "a" (NameError at runtime, or a
    stale leaked name in Python 2); returning r.R() directly gives the
    intended list of right dependents.'''
    return [r.R() for r in self.all_rules() if r.head() == head]
# Constructor: delegates rules/terminals/numtag to io.Grammar, then keeps
# the DMV-specific probability tables on the instance.
# NOTE(review): the assignments for p_STOP and p_ROOT fall on lines that
# are missing from this sampled listing -- confirm against the full file.
109 def __init__(self, p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag):
110 io.Grammar.__init__(self, p_rules, p_terminals, numtag)
112 self.p_CHOOSE = p_CHOOSE
# DMV_Rule: one CNF rule LHS -> L R where each node is a (bars, head)
# pair.  The table below lists the rule types and which probability
# expressions belong to them (adjacent vs non-adjacent STOP/attach).
# NOTE(review): sampled listing -- parts of the docstring are missing.
116 class DMV_Rule(io.CNF_Rule):
117 '''A single CNF rule in the PCFG, of the form
119 where LHS = (bars, head)
127 Different rule-types have different probabilities associated with
130 _h_ -> STOP h_ P( STOP|h,L, adj)
131 _h_ -> STOP h_ P( STOP|h,L,non_adj)
132 h_ -> h STOP P( STOP|h,R, adj)
133 h_ -> h STOP P( STOP|h,R,non_adj)
134 h_ -> _a_ h_ P(-STOP|h,L, adj) * P(a|h,L)
135 h_ -> _a_ h_ P(-STOP|h,L,non_adj) * P(a|h,L)
136 h -> h _a_ P(-STOP|h,R, adj) * P(a|h,R)
137 h -> h _a_ P(-STOP|h,R,non_adj) * P(a|h,R)
# p(LRattach, RLattach): select probA (adjacent) when neither child has a
# lower attachment on the relevant side, otherwise presumably probN --
# the return statements fall on lines missing from this sampled listing.
139 def p(self, LRattach, RLattach, *arg):
140 '''Returns the correct probability, adjacent or non-adjacent,
141 depending on whether or not there is some lower attachment
142 either on the right side of the left child, or the left side
143 of the right child. '''
144 if (not LRattach) and (not RLattach):
def __init__(self, LHS, L, R, probN, probA):
    '''Build a DMV rule LHS -> L R, checking that each node is a
    (bars, head) pair with a legal bars value.

    probN is the non-adjacent probability and probA the adjacent one;
    the superclass constructor is handed probN, and both values are
    kept as attributes so p() can pick between them.'''
    for node in (LHS, L, R):
        if node[0] not in BARS:
            raise ValueError("bars must be in %s; was given: %s" % (BARS, node[0]))
    io.CNF_Rule.__init__(self, LHS, L, R, probN)
    self.probN = probN  # non-adjacent probability
    self.probA = probA  # adjacent probability
# bar_str(b_h): render a (bars, head) pair as " h ", " h_ " or "_h_ ".
# NOTE(review): the elif/else lines and the final return are missing from
# this sampled listing; also the local name "str" shadows the builtin.
163 @classmethod # so we can call DMV_Rule.bar_str(b_h)
164 def bar_str(cls, b_h):
165 str = " %d " % b_h[1]
167 str = " %d_ " % b_h[1]
169 str = "_%d_ " % b_h[1]
# fragment of __str__: prints LHS-->L R with the two probabilities
# ([probN] [probA], judging by the "%.2f" pair); def line not shown.
177 return "%s-->%s %s\t[%.2f] [%.2f]" % (self.bar_str(self.LHS()),
178 self.bar_str(self.L()),
179 self.bar_str(self.R()),
189 ###################################
190 # dmv-specific version of inner() #
191 ###################################
# rewrite_adj(bars, Lattach, Rattach): enumerate the 8 (LL, LR, RL, RR)
# attachment-boolean combinations permitted below an attachment rewrite,
# given the parent's bars and its L/R attachment flags.  NOBAR parents
# must carry Rattach and no Lattach; RBAR parents must carry Lattach.
# NOTE(review): the fall-through "return ()" implied by the trailing
# comment falls on lines missing from this sampled listing.
192 def rewrite_adj(bars, Lattach, Rattach):
193 # todo: make prettier? Although since we call this so many times,
194 # having it spelled out here is probably faster
195 if bars == NOBAR and not Lattach and Rattach:
196 return ( (Lattach, False, False, False),
197 (Lattach, False, False, True),
198 (Lattach, False, True, False),
199 (Lattach, False, True, True),
200 (Lattach, True, False, False),
201 (Lattach, True, False, True),
202 (Lattach, True, True, False),
203 (Lattach, True, True, True), )
204 elif bars == RBAR and Lattach:
205 # Rattach may be either true or false here!
206 return ( (False, False, False, Rattach),
207 (False, False, True, Rattach),
208 (False, True, False, Rattach),
209 (False, True, True, Rattach),
210 (True, False, False, Rattach),
211 (True, False, True, Rattach),
212 (True, True, False, Rattach),
213 (True, True, True, Rattach) )
215 # NOBAR rewrite rules cannot have Lattach below, and must
216 # have/add Rattach. RBAR rewrite rules must add Lattach, but
217 # don't care about Rattach. Returning () should ensure we
218 # don't add any probability to such "false" situations
# inner_dmv: inside-probability computation over a chart keyed by
# (s, t, LHS, Lattach, Rattach); returns [inner_prob, chart].
# NOTE(review): sampled listing -- many lines are missing (e.g. the
# bindings of L and R from the rule around original line 280, several
# if/else lines), so read the control flow below with care.
221 def inner_dmv(s, t, LHS, g, sent, chart):
222 ''' A rewrite of inner in io.py, to take adjacency into accord.
224 The chart is now 4 times bigger, since there are different values
225 for with or without L/R attachments:
226 chart[(s,t,LHS, Lattach, Rattach)]
228 If Rattach==True then the rule has a right-attachment or there is
229 one lower in the tree (meaning we're no longer
230 adjacent). Adjacency depends on whether there is an attachment
231 lower in the tree, cf. DMV_Rule.p(LRattach, RLattach).
233 Todo: make this work, then, if possible, refactor (move
234 dmv-specific stuff back into dmv, so this is "general" again)
236 Or, if that doesn't work, we might as well make it a method of
# debug helper: prints the span/category being tried, with a verbal
# description of the attachment situation.
239 def debug_inner_dmv(tabs,s,t,LHS,Lattach,Rattach):
242 (True, True): "left and right attachments below",
243 (True, False): "left attachment(s) below",
244 (False, True): "right attachment(s) below",
245 (False, False): "no attachments below" }
246 info = (tabs,O(s),s,O(t),t, DMV_Rule.bar_str(LHS), attach[Lattach,Rattach])
247 print "%sTrying from %s_%d to %s_%d with %s, %s:" % info
# e(): the memoized recursive worker; n_t appears to track recursion
# depth for indentation of debug output -- TODO confirm.
252 def e(s,t,LHS, Lattach, Rattach, n_t):
256 if (s, t, LHS, Lattach, Rattach) in chart:
257 return chart[(s, t, LHS, Lattach, Rattach)]
259 debug_inner_dmv(tab(),s,t,LHS, Lattach, Rattach)
# single-word span (s == t, presumably -- the guard is sampled out):
261 if Lattach or Rattach:
262 # terminals are always F,F for attachment
263 io.debug("%s= 0.0 (1 word, no lower attach)" % tab())
265 elif (LHS, O(s)) in g.p_terminals:
266 prob = g.p_terminals[LHS, O(s)] # b[LHS, O(s)] in Lari&Young
268 # todo: assuming this is how to deal with lacking
269 # rules, since we add prob.s, and 0 is identity
271 print "%sLACKING TERMINAL:" % tab()
272 # todo: add to chart perhaps? Although, it _is_ simple lookup..
273 io.debug( "%s= %.1f (terminal: %s -> %s" % (tab(),prob,LHS,O(s)) )
# multi-word span: sum over all rules rewriting LHS.
276 if (s,t,LHS,Lattach, Rattach) not in chart:
277 chart[(s,t,LHS,Lattach,Rattach)] = 0.0
278 for rule in g.rules(LHS): # summing over j,k in a[LHS,j,k]
279 io.debug( "%ssumming rule %s" % (tab(),rule) )
282 # if it's a STOP rule, rewrite for the same range:
283 if (L == STOP) or (R == STOP):
285 p = rule.p(Lattach, False) # todo check
286 pLR = e(s, t, R, Lattach, Rattach, n_t+1)
288 p = rule.p(False, Rattach) # todo check
289 pLR = e(s, t, L, Lattach, Rattach, n_t+1)
290 chart[(s, t, LHS, Lattach, Rattach)] += p * pLR
292 # not a STOP, an attachment rewrite:
# split the span at every r, enumerating legal child attachments:
294 for r in range(s, t):
295 # LL etc are boolean attachment values
296 for (LL, LR, RL, RR) in rewrite_adj(rule.bars(), Lattach, Rattach):
297 p = rule.p(LR, RL) # probN or probA
298 pL = e(s, r, L, LL, LR, n_t+1)
299 pR = e(r+1, t, R, RL, RR, n_t+1)
300 chart[(s, t, LHS,Lattach,Rattach)] += p * pL * pR
302 # io.debug( "\tchart[(s:%d, t:%d, %s ,L:%s, R:%s)] = %.2f" % (s,t,
303 # DMV_Rule.bar_str(LHS),
306 # chart[(s,t,LHS,Lattach,Rattach)]) )
307 return chart[(s, t, LHS,Lattach,Rattach)]
# total inside probability = sum over all four attachment situations.
310 inner_prob = e(s,t,LHS,True,True, 0) + e(s,t,LHS,True,False, 0) + e(s,t,LHS,False,True, 0) + e(s,t,LHS,False,False, 0)
313 for k,v in chart.iteritems():
314 print "\t%s -> %s_%d ... %s_%d (L:%s, R:%s):\t%.3f" % (DMV_Rule.bar_str(k[2]),
318 print "---CHART:end---"
319 return [inner_prob, chart]
# Smoke test: a tiny 1-tag grammar over the 2-word sentence "h h";
# b2 holds the short-cut terminal probabilities (its creation line is
# missing from this sampled listing).  Expected inside probability 0.183.
323 if __name__ == "__main__": # Non, Adj
324 _h_ = DMV_Rule((LRBAR,0), STOP, ( RBAR,0), 1.0, 1.0) # LSTOP
325 h_S = DMV_Rule(( RBAR,0),(NOBAR,0), STOP, 0.4, 0.3) # RSTOP
326 h_A = DMV_Rule(( RBAR,0),(LRBAR,0),( RBAR,0), 0.6, 0.7) # Lattach
327 h = DMV_Rule((NOBAR,0),(NOBAR,0),(LRBAR,0), 1.0, 1.0) # Rattach
329 b2[(NOBAR, 0), 'h'] = 1.0
330 b2[(RBAR, 0), 'h'] = h_S.probA
331 b2[(LRBAR, 0), 'h'] = h_S.probA * _h_.probA
333 g2 = DMV_Grammar([ _h_, h_S, h_A, h ],b2,0,0,0, {0:'h'})
336 test1 = inner_dmv(0, 1, (LRBAR,0), g2, 'h h'.split(), {})
337 print "Should be 0.183: %.3f" % test1[0]
340 ##############################
341 # DMV-probabilities, todo: #
342 ##############################
# P_STOP: reestimation of the stop probability (M-step sketch).
# NOTE(review): this is still pseudo-code, per the changelog above:
# "t" ranges over "i" which is never bound (the loop variable is "s"),
# and "h-r" / "l-h-r" are not meaningful arithmetic -- they look like
# placeholders for the barred categories (h_ and _h_?).  Also g, sent,
# chart, loc() are unbound here.  TODO confirm against the full file.
344 def P_STOP(STOP, h, dir, adj, corpus):
345 '''corpus is a list of sentences s.
347 This is based on the formula where STOP is True... not sure how we
348 calculate if STOP is False.
351 I thought about instead having this:
353 for rule in g.p_rules:
357 for rule in g.p_rules:
360 set num and den using inner
361 for rule in g.p_rules
362 rule.prob = rule.num / rule.den
364 ..the way I'm assuming we do it in the commented out io-function in
365 io.py. Having sentences as the outer loop at least we can easily just
366 go through the heads that are actually in the sentence... BUT, this
367 means having to go through p_rules 3 times, not sure what is slower.
370 P_STOP(-STOP|...) = 1 - P_STOP(STOP|...)
377 # here we should somehow make each word in the sentence
378 # unique, decorate them with subscripts or something. We have
379 # to run through the sentence as many times as h appears
380 # there. This also means changing inner(), I suspect. Have to
381 # make sure we separate reading of inner_prob from changing of
383 for s in range(loc(h)): # i<loc(h), where h is in the sentence.
384 for t in range(i, len(sent)):
385 P_STOP_num += inner(s, t, h-r, g, sent, chart)
386 P_STOP_den += inner(s, t, l-h-r, g, sent, chart)
387 return P_STOP_num / P_STOP_den # possibly other way round? todo
# NOTE(review): fragment of a sentence-probability sketch (the def line
# is missing from this sampled listing); the body below is pseudo-code
# combining P_STOP and P_CHOOSE over each head's dependents.
394 '''Here it seems like they store rule information on a per-head (per
395 direction) basis, in deps_D(h, dir) which gives us a list. '''
398 for dir in ['l', 'r']:
399 for a in deps(h, dir):
402 P_STOP (0, h, dir, adj) * \
403 P_CHOOSE (a, h, dir) * \
405 P_STOP (STOP | h, dir, adj)
407 return P_h(root(sent))
411 ##############################
412 # Initialization, todo #
413 ##############################
# NOTE(review): fragment of taglist() (used by initialize() below); its
# def line and most of its body are missing from this sampled listing.
415 '''sents is of this form:
416 [['tag', ...], ['tag2', ...], ...]
418 Return a list of the tags. (Has to be ordered for enumerating to be
421 Fortunately only has to run once.
428 raise ValueError("it seems we must have a new ROOT symbol")
# init_zeros: pre-populate the frequency dict with 0 counts for every
# tag's STOP/-STOP counters and 0.0 sums per direction.
# NOTE(review): the dict creation, the per-tag loop header, and the
# return are on lines missing from this sampled listing.
435 def init_zeros(tags):
436 "Return a frequency dictionary with DMV-relevant keys set to 0 / {}."
441 for dir_adj in ['LN','LA','RN','RA']:
442 f[tag, 'STOP', dir_adj] = 0
443 f[tag, '-STOP', dir_adj] = 0
446 f[tag, 'sum', 'R'] = 0.0
447 f[tag, 'sum', 'L'] = 0.0
# init_freq: harvest ROOT, STOP and CHOOSE frequency counts from the
# corpus for harmonic initialization.
# NOTE(review): sampled listing -- e.g. n = len(sent), the ROOT count
# increment, the dir binding for the CHOOSE loop and the return are on
# missing lines.
450 def init_freq(corpus, tags):
451 '''Returns f, a dictionary with these types of keys:
452 - ('ROOT', tag) is basically just the frequency of tag
453 - (tag, 'STOP', 'LN') is for P_STOP(STOP|tag, left, non_adj);
454 etc. for 'RN', 'LA', 'LN', '-STOP'.
455 - (tag, 'L') is a dictionary of arg:f, where head could take arg
456 to direction 'L' (etc. for 'R') and f is "harmonically" divided
457 by distance, used for finding P_CHOOSE
460 1. counts word frequencies for f_ROOT
461 2. adds to certain f_STOP counters if a word is found first,
462 last, first or second, or last or second to last in the sentence
463 (Left Adjacent, Left Non-Adjacent, etc)
464 3. adds to f_CHOOSE(arg|head) a "harmonic" number (divided by
465 distance between arg and head)
469 for sent in corpus: # sent is ['VBD', 'NN', ...]
471 # NOTE: head in DMV_Rule is a number, while this is the string
472 for i_h, head in enumerate(sent):
473 # todo grok: how is this different from just using straight head
474 # frequency counts, for the ROOT probabilities?
476 f['sum', 'ROOT'] += 1
478 # True = 1, False = 0. todo: make prettier
479 f[head, 'STOP', 'LN'] += (i_h <= 1) # first two words
480 f[head, '-STOP', 'LN'] += (not i_h <= 1)
481 f[head, 'STOP', 'LA'] += (i_h == 0) # very first word
482 f[head, '-STOP', 'LA'] += (not i_h == 0)
483 f[head, 'STOP', 'RN'] += (i_h >= n - 2) # last two words
484 f[head, '-STOP', 'RN'] += (not i_h >= n - 2)
485 f[head, 'STOP', 'RA'] += (i_h == n - 1) # very last word
486 f[head, '-STOP', 'RA'] += (not i_h == n - 1)
488 # this is where we make the "harmonic" distribution. quite.
489 for i_a, arg in enumerate(sent):
# NOTE(review): precedence check -- as written this is
# (1.0/dist) + HARMONIC_C; the docstring says "divided by
# distance", and Klein & Manning-style harmonic init is usually
# 1.0/(dist + C).  TODO confirm which was intended.
491 harmony = 1.0/abs(i_h - i_a) + HARMONIC_C
496 if arg not in f[head, dir]:
497 f[head, dir][arg] = 0.0
498 f[head, dir][arg] += harmony
499 f[head, 'sum', dir] += harmony
500 # todo, optimization: possible to do both directions
501 # at once here, and later on rule out the ones we've
502 # done? does it actually speed things up?
# init_normalize: turn the raw counts from init_freq into p_STOP,
# p_ROOT, p_CHOOSE, p_terminals and a rule list, and build the grammar.
# NOTE(review): sampled listing -- p_rules' initialization, the ROOT
# rule's probability arguments, and the else-branch header before the
# NOBAR rule are on missing lines.  The divisions below will raise
# ZeroDivisionError for a tag with zero total counts -- presumably
# impossible since tags come from the corpus; confirm.
506 def init_normalize(f, tags, tagnum, numtag):
507 '''Use frequencies (and sums) in f to return create p_STOP and
508 p_CHOOSE; at the same time adding the context-free rules to the
509 grammar using these probabilities.
511 Return a usable grammar.'''
513 p_STOP, p_ROOT, p_CHOOSE, p_terminals = {},{},{},{}
514 for n_h, head in enumerate(tags):
515 p_ROOT[n_h] = float(f['ROOT', head]) / f['sum', 'ROOT']
516 p_rules.append( DMV_Rule(ROOT, (LRBAR,n_h), STOP,
520 # p_STOP = STOP / (STOP + NOT_STOP)
521 for dir in ['L','R']:
522 for adj in ['N','A']:
523 p_STOP[n_h, dir+adj] = \
524 float(f[head, 'STOP', dir+adj]) / \
525 (f[head, 'STOP', dir+adj] + f[head, '-STOP', dir+adj])
526 # make rule using the previously found probN and probA:
527 p_rules.append( DMV_Rule((RBAR, n_h), (NOBAR, n_h), STOP,
528 p_STOP[n_h, dir+'N'],
529 p_STOP[n_h, dir+'A']) )
531 # inner() shouldn't have to deal with those long non-branching stops:
532 p_terminals[(NOBAR, n_h), head] = 1.0
533 p_terminals[(RBAR, n_h), head] = p_STOP[n_h, 'RA']
534 p_terminals[(LRBAR, n_h), head] = p_STOP[n_h, 'RA'] * p_STOP[n_h, 'LA']
536 for dir in ['L', 'R']:
537 for arg, val in f[head, dir].iteritems():
538 p_CHOOSE[tagnum[arg], n_h, dir] = float(val) / f[head,'sum',dir]
540 # after the head tag-loop, add every head-argument rule:
541 for (n_a, n_h, dir),p_C in p_CHOOSE.iteritems():
542 if dir == 'L': # arg is to the left of head
543 p_rules.append( DMV_Rule((RBAR,n_h), (LRBAR,n_a), (RBAR,n_h),
544 p_C*(1-p_STOP[n_h, dir+'N']),
545 p_C*(1-p_STOP[n_h, dir+'A'])) )
547 p_rules.append( DMV_Rule((NOBAR,n_h), (LRBAR,n_a), (NOBAR,n_h),
548 p_C*(1-p_STOP[n_h, dir+'N']),
549 p_C*(1-p_STOP[n_h, dir+'A'])) )
551 return DMV_Grammar(p_rules, p_terminals, p_STOP, p_CHOOSE, p_ROOT, numtag)
# initialize: taglist -> frequency counts -> normalized DMV_Grammar.
# NOTE(review): the bodies filling tagnum/numtag and the final
# "return g" are on lines missing from this sampled listing.
554 def initialize(corpus):
555 '''Return an initialized DMV_Grammar
556 corpus is a list of lists of tags.'''
557 tags = taglist(corpus)
558 tagnum, numtag = {}, {}
559 for num, tag in enumerate(tags):
562 # f: frequency counts used in initialization, mostly distances
563 f = init_freq(corpus, tags)
564 g = init_normalize(f, tags, tagnum, numtag)
# Commented-out demo of initialize() on a toy corpus.
568 if __name__ == "__main__":
569 # print "--------initialization------------"
570 # print initialize([['foo', 'two','foo','foo'],
571 # ['zero', 'one','two','three']])
# Brown-corpus smoke test: builds a grammar from the first ten tagged
# sentences (hardcoded on the following line) and runs inner_dmv on a
# two-tag span.
585 # todo: some testing on the Brown corpus:
587 if __name__ == "__main__":
588 # first five sentences of the Brown corpus:
589 g_brown = initialize([['AT', 'NP-TL', 'NN-TL', 'JJ-TL', 'NN-TL', 'VBD', 'NR', 'AT', 'NN', 'IN', 'NP$', 'JJ', 'NN', 'NN', 'VBD', '``', 'AT', 'NN', "''", 'CS', 'DTI', 'NNS', 'VBD', 'NN', '.'], ['AT', 'NN', 'RBR', 'VBD', 'IN', 'NN', 'NNS', 'CS', 'AT', 'NN-TL', 'JJ-TL', 'NN-TL', ',', 'WDT', 'HVD', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', '``', 'VBZ', 'AT', 'NN', 'CC', 'NNS', 'IN', 'AT', 'NN-TL', 'IN-TL', 'NP-TL', "''", 'IN', 'AT', 'NN', 'IN', 'WDT', 'AT', 'NN', 'BEDZ', 'VBN', '.'], ['AT', 'NP', 'NN', 'NN', 'HVD', 'BEN', 'VBN', 'IN', 'NP-TL', 'JJ-TL', 'NN-TL', 'NN-TL', 'NP', 'NP', 'TO', 'VB', 'NNS', 'IN', 'JJ', '``', 'NNS', "''", 'IN', 'AT', 'JJ', 'NN', 'WDT', 'BEDZ', 'VBN', 'IN', 'NN-TL', 'NP', 'NP', 'NP', '.'], ['``', 'RB', 'AT', 'JJ', 'NN', 'IN', 'JJ', 'NNS', 'BEDZ', 'VBN', "''", ',', 'AT', 'NN', 'VBD', ',', '``', 'IN', 'AT', 'JJ', 'NN', 'IN', 'AT', 'NN', ',', 'AT', 'NN', 'IN', 'NNS', 'CC', 'AT', 'NN', 'IN', 'DT', 'NN', "''", '.'], ['AT', 'NN', 'VBD', 'PPS', 'DOD', 'VB', 'CS', 'AP', 'IN', 'NP$', 'NN', 'CC', 'NN', 'NNS', '``', 'BER', 'JJ', 'CC', 'JJ', 'CC', 'RB', 'JJ', "''", '.'], ['PPS', 'VBD', 'CS', 'NP', 'NNS', 'VB', '``', 'TO', 'HV', 'DTS', 'NNS', 'VBN', 'CC', 'VBN', 'IN', 'AT', 'NN', 'IN', 'VBG', 'CC', 'VBG', 'PPO', "''", '.'], ['AT', 'JJ', 'NN', 'VBD', 'IN', 'AT', 'NN', 'IN', 'AP', 'NNS', ',', 'IN', 'PPO', 'AT', 'NP', 'CC', 'NP-TL', 'NN-TL', 'VBG', 'NNS', 'WDT', 'PPS', 'VBD', '``', 'BER', 'QL', 'VBN', 'CC', 'VB', 'RB', 'VBN', 'NNS', 'WDT', 'VB', 'IN', 'AT', 'JJT', 'NN', 'IN', 'ABX', 'NNS', "''", '.'], ['NN-HL', 'VBN-HL'], ['WRB', ',', 'AT', 'NN', 'VBD', 'PPS', 'VBZ', '``', 'DTS', 'CD', 'NNS', 'MD', 'BE', 'VBN', 'TO', 'VB', 'JJR', 'NN', 'CC', 'VB', 'AT', 'NN', 'IN', 'NN', "''", '.'], ['AT', 'NN-TL', 'VBG-TL', 'NN-TL', ',', 'AT', 'NN', 'VBD', ',', '``', 'BEZ', 'VBG', 'IN', 'VBN', 'JJ', 'NNS', 'CS', 'AT', 'NN', 'IN', 'NN', 'NNS', 'NNS', "''", '.']])
590 # 36:'AT' in g_brown.numtag, 40:'NP-TL'
# NOTE(review): tag numbers 36/40 depend on taglist()'s ordering being
# stable -- confirm against the full file.
591 test_brown = inner_dmv(0, 1, (LRBAR,36), g_brown, ['AT', 'NP-TL'], {})
592 print "Brown-test gives: %.3f" % test_brown[0]
595 # this will give the tag sequences of all the 6218 Brown corpus
596 # sentences of length < 7:
597 # [[tag for (w, tag) in sent]
598 # for sent in nltk.corpus.brown.tagged_sents() if len(sent) < 7]
602 "472 tags, takes a while to extract with tagset(), hardcoded here."
603 return set(['BEDZ-NC', 'NP$', 'AT-TL', 'CS', 'NP+HVZ', 'IN-TL-HL', 'NR-HL', 'CC-TL-HL', 'NNS$-HL', 'JJS-HL', 'JJ-HL', 'WRB-TL', 'JJT-TL', 'WRB', 'DOD*', 'BER*-NC', ')-HL', 'NPS$-HL', 'RB-HL', 'FW-PPSS', 'NP+HVZ-NC', 'NNS$', '--', 'CC-TL', 'FW-NN-TL', 'NP-TL-HL', 'PPSS+MD', 'NPS', 'RBR+CS', 'DTI', 'NPS-TL', 'BEM', 'FW-AT+NP-TL', 'EX+BEZ', 'BEG', 'BED', 'BEZ', 'DTX', 'DOD*-TL', 'FW-VB-NC', 'DTS', 'DTS+BEZ', 'QL-HL', 'NP$-TL', 'WRB+DOD*', 'JJR+CS', 'NN+MD', 'NN-TL-HL', 'HVD-HL', 'NP+BEZ-NC', 'VBN+TO', '*-TL', 'WDT-HL', 'MD', 'NN-HL', 'FW-BE', 'DT$', 'PN-TL', 'DT-HL', 'FW-NR-TL', 'VBG', 'VBD', 'VBN', 'DOD', 'FW-VBG-TL', 'DOZ', 'ABN-TL', 'VB+JJ-NC', 'VBZ', 'RB+CS', 'FW-PN', 'CS-NC', 'VBG-NC', 'BER-HL', 'MD*', '``', 'WPS-TL', 'OD-TL', 'PPSS-HL', 'PPS+MD', 'DO*', 'DO-HL', 'HVG-HL', 'WRB-HL', 'JJT', 'JJS', 'JJR', 'HV+TO', 'WQL', 'DOD-NC', 'CC-HL', 'FW-PPSS+HV', 'FW-NP-TL', 'MD+TO', 'VB+IN', 'JJT-NC', 'WDT+BEZ-TL', '---HL', 'PN$', 'VB+PPO', 'BE-TL', 'VBG-TL', 'NP$-HL', 'VBZ-TL', 'UH', 'FW-WPO', 'AP+AP-NC', 'FW-IN', 'NRS-TL', 'ABL', 'ABN', 'TO-TL', 'ABX', '*-HL', 'FW-WPS', 'VB-NC', 'HVD*', 'PPS+HVD', 'FW-IN+AT', 'FW-NP', 'QLP', 'FW-NR', 'FW-NN', 'PPS+HVZ', 'NNS-NC', 'DT+BEZ-NC', 'PPO', 'PPO-NC', 'EX-HL', 'AP$', 'OD-NC', 'RP', 'WPS+BEZ', 'NN+BEZ', '.-TL', ',', 'FW-DT+BEZ', 'RB', 'FW-PP$-NC', 'RN', 'JJ$-TL', 'MD-NC', 'VBD-NC', 'PPSS+BER-N', 'RB+BEZ-NC', 'WPS-HL', 'VBN-NC', 'BEZ-HL', 'PPL-NC', 'BER-TL', 'PP$$', 'NNS+MD', 'PPS-NC', 'FW-UH-NC', 'PPS+BEZ-NC', 'PPSS+BER-TL', 'NR-NC', 'FW-JJ', 'PPS+BEZ-HL', 'NPS$', 'RB-TL', 'VB-TL', 'BEM*', 'MD*-HL', 'FW-CC', 'NP+MD', 'EX+HVZ', 'FW-CD', 'EX+HVD', 'IN-HL', 'FW-CS', 'JJR-HL', 'FW-IN+NP-TL', 'JJ-TL-HL', 'FW-UH', 'EX', 'FW-NNS-NC', 'FW-JJ-NC', 'VBZ-HL', 'VB+RP', 'BEZ-NC', 'PPSS+HV-TL', 'HV*', 'IN', 'PP$-NC', 'NP-NC', 'BEN', 'PP$-TL', 'FW-*-TL', 'FW-OD-TL', 'WPS', 'WPO', 'MD+PPSS', 'WDT+BER', 'WDT+BEZ', 'CD-HL', 'WDT+BEZ-NC', 'WP$', 'DO+PPSS', 'HV-HL', 'DT-NC', 'PN-NC', 'FW-VBZ', 'HVD', 'HVG', 'NN+BEZ-TL', 'HVZ', 'FW-VBD', 'FW-VBG', 
'NNS$-TL', 'JJ-TL', 'FW-VBN', 'MD-TL', 'WDT+DOD', 'HV-TL', 'NN-TL', 'PPSS', 'NR$', 'BER', 'FW-VB', 'DT', 'PN+BEZ', 'VBG-HL', 'FW-PPL+VBZ', 'FW-NPS-TL', 'RB$', 'FW-IN+NN', 'FW-CC-TL', 'RBT', 'RBR', 'PPS-TL', 'PPSS+HV', 'JJS-TL', 'NPS-HL', 'WPS+BEZ-TL', 'NNS-TL-HL', 'VBN-TL-NC', 'QL-TL', 'NN+NN-NC', 'JJR-TL', 'NN$-TL', 'FW-QL', 'IN-TL', 'BED-NC', 'NRS', '.-HL', 'QL', 'PP$-HL', 'WRB+BER', 'JJ', 'WRB+BEZ', 'NNS$-TL-HL', 'PPSS+BEZ', '(', 'PPSS+BER', 'DT+MD', 'DOZ-TL', 'PPSS+BEM', 'FW-PP$', 'RB+BEZ-HL', 'FW-RB+CC', 'FW-PPS', 'VBG+TO', 'DO*-HL', 'NR+MD', 'PPLS', 'IN+IN', 'BEZ*', 'FW-PPL', 'FW-PPO', 'NNS-HL', 'NIL', 'HVN', 'PPSS+BER-NC', 'AP-TL', 'FW-DT', '(-HL', 'DTI-TL', 'JJ+JJ-NC', 'FW-RB', 'FW-VBD-TL', 'BER-NC', 'NNS$-NC', 'JJ-NC', 'NPS$-TL', 'VB+VB-NC', 'PN', 'VB+TO', 'AT-TL-HL', 'BEM-NC', 'PPL-TL', 'ABN-HL', 'RB-NC', 'DO-NC', 'BE-HL', 'WRB+IN', 'FW-UH-TL', 'PPO-HL', 'FW-CD-TL', 'TO-HL', 'PPS+BEZ', 'CD$', 'DO', 'EX+MD', 'HVZ-TL', 'TO-NC', 'IN-NC', '.', 'WRB+DO', 'CD-NC', 'FW-PPO+IN', 'FW-NN$-TL', 'WDT+BEZ-HL', 'RP-HL', 'CC', 'NN+HVZ-TL', 'FW-NNS-TL', 'DT+BEZ', 'WPS+HVZ', 'BEDZ*', 'NP-TL', ':-TL', 'NN-NC', 'WPO-TL', 'QL-NC', 'FW-AT+NN-TL', 'WDT+HVZ', '.-NC', 'FW-DTS', 'NP-HL', ':-HL', 'RBR-NC', 'OD-HL', 'BEDZ-HL', 'VBD-TL', 'NPS-NC', ')', 'TO+VB', 'FW-IN+NN-TL', 'PPL', 'PPS', 'PPSS+VB', 'DT-TL', 'RP-NC', 'VB', 'FW-VB-TL', 'PP$', 'VBD-HL', 'DTI-HL', 'NN-TL-NC', 'PPL-HL', 'DOZ*', 'NR-TL', 'WRB+MD', 'PN+HVZ', 'FW-IN-TL', 'PN+HVD', 'BEN-TL', 'BE', 'WDT', 'WPS+HVD', 'DO-TL', 'FW-NN-NC', 'WRB+BEZ-TL', 'UH-TL', 'JJR-NC', 'NNS', 'PPSS-NC', 'WPS+BEZ-NC', ',-TL', 'NN$', 'VBN-TL-HL', 'WDT-NC', 'OD', 'FW-OD-NC', 'DOZ*-TL', 'PPSS+HVD', 'CS-TL', 'WRB+DOZ', 'CC-NC', 'HV', 'NN$-HL', 'FW-WDT', 'WRB+DOD', 'NN+HVZ', 'AT-NC', 'NNS-TL', 'FW-BEZ', 'CS-HL', 'WPO-NC', 'FW-BER', 'NNS-TL-NC', 'BEZ-TL', 'FW-IN+AT-T', 'ABN-NC', 'NR-TL-HL', 'BEDZ', 'NP+BEZ', 'FW-AT-TL', 'BER*', 'WPS+MD', 'MD-HL', 'BED*', 'HV-NC', 'WPS-NC', 'VBN-HL', 'FW-TO+VB', 'PPSS+MD-NC', 'HVZ*', 'PPS-HL', 'WRB-NC', 'VBN-TL', 
'CD-TL-HL', ',-NC', 'RP-TL', 'AP-HL', 'FW-HV', 'WQL-TL', 'FW-AT', 'NN', 'NR$-TL', 'VBZ-NC', '*', 'PPSS-TL', 'JJT-HL', 'FW-NNS', 'NP', 'UH-HL', 'NR', ':', 'FW-NN$', 'RP+IN', ',-HL', 'JJ-TL-NC', 'AP-NC', '*-NC', 'VB-HL', 'HVZ-NC', 'DTS-HL', 'FW-JJT', 'FW-JJR', 'FW-JJ-TL', 'FW-*', 'RB+BEZ', "''", 'VB+AT', 'PN-HL', 'PPO-TL', 'CD-TL', 'UH-NC', 'FW-NN-TL-NC', 'EX-NC', 'PPSS+BEZ*', 'TO', 'WDT+DO+PPS', 'IN+PPO', 'AP', 'AT', 'DOZ-HL', 'FW-RB-TL', 'CD', 'NN+IN', 'FW-AT-HL', 'PN+MD', "'", 'FW-PP$-TL', 'FW-NPS', 'WDT+BER+PP', 'NN+HVD-TL', 'MD+HV', 'AT-HL', 'FW-IN+AT-TL'])