# Todo: since we evaluate _after_ we reestimate, we lose the icharts
# made while reestimating. If we had these available, evaluate and
# corpus_likelihood would be a lot faster, but since they need to run
# _after_ reestimate, we'll have to store an ichart per sentence. So
# try storing those icharts in some loc_h_dmv global, and see if it's
# faster using space rather than time.
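
# A minimal sketch of that idea, with hypothetical names (loc_h_dmv has no
# such cache yet): keep a module-level dict keyed by the sentence, let
# reestimate fill it, and let evaluate/corpus_likelihood reuse it instead of
# passing fresh {} charts.
#
#     ICHARTS = {} # would live in loc_h_dmv
#     def cached_inner_sent(g, sent):
#         key = tuple(sent)
#         if key not in ICHARTS:
#             ICHARTS[key] = {}
#         return inner_sent(g, sent, ICHARTS[key])
#
# Entries would go stale whenever g changes, so the cache would need
# clearing (or re-keying on g) between reestimation iterations.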
from math import log

from common_dmv import MPPROOT, GOR, test, node_str
from wsjdep import WSJDepCorpusReader

def initialize_loc_h(tagonlys):
    import loc_h_harmonic # since we need to change constants (is there a better way?)
    loc_h_harmonic.HARMONIC_C = 0.0
    loc_h_harmonic.FNONSTOP_MIN = 25
    loc_h_harmonic.FSTOP_MIN = 5
    loc_h_harmonic.RIGHT_FIRST = 1.0
    return loc_h_harmonic.initialize(tagonlys)

def initialize_cnf(tagonlys):
    import cnf_harmonic # since we need to change constants (is there a better way?)
    cnf_harmonic.HARMONIC_C = 0.0
    cnf_harmonic.FNONSTOP_MIN = 25
    cnf_harmonic.FSTOP_MIN = 5
    return cnf_harmonic.initialize(tagonlys)
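
# Both initializers above are meant to be passed as the `initialize` argument
# of test_likelihood below. What the constants mean is only a guess from
# their names: HARMONIC_C is presumably the additive constant of a Klein &
# Manning-style harmonic initializer, FNONSTOP_MIN/FSTOP_MIN pseudo-counts
# for the stop decisions, and RIGHT_FIRST the weight for taking right
# arguments before left ones; see loc_h_harmonic/cnf_harmonic for the
# actual use.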

def test_likelihood(reestimate, initialize, inner_sent, corpus_size=20, corpus_offset=1000):
    def run_IO(g, iterations, tagonlys, tags_and_parses):
        print corpus_likelihood(g, tagonlys)
        # print evaluate(g, tags_and_parses) #
        for i in range(iterations):
            g = reestimate(g, tagonlys)
            print "reestimation number %d done"%i
            # print evaluate(g, tags_and_parses) #
            print corpus_likelihood(g, tagonlys)
        return g
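
    # corpus_likelihood sums log P(sentence); each P is at most 1, so every
    # term is <= 0 and the total approaches 0 from below as the grammar fits
    # the corpus better. If reestimate is a proper EM step, this sum should
    # never decrease between iterations.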
    def corpus_likelihood(g, tagsonly):
        sumlog = 0.0
        for sent in tagsonly:
            p_sent = inner_sent(g, sent, {})
            if p_sent == 0.0:
                print "%s had zero probability!"%sent
            else:
                sumlog += log(p_sent)
        return "Sum of log P_{sentence}: %.4f (should move towards 0)\n"%sumlog

    reader = WSJDepCorpusReader(None)
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]
    tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size]

    # from loc_h_dmv import testcorpus
    # tagonlys = testcorpus

    print "initializing %d sentences..." % corpus_size,
    g = initialize(tagonlys)

    g = run_IO(g, 4, tagonlys, tags_and_parses)
    return g

def evaluate(g, tagged_and_parsed_sents):
    '''
    tagged_and_parsed_sents is a list of pairs:
    (tagonly_sent, parsed_sent)

    R_num += 1 if pair from parsed is in mpp
    R_den += 1 per pair from parsed

    P_num += 1 if pair from mpp is in parsed
    P_den += 1 per pair from mpp

    F1 = (2 * P * R)/(P + R), the harmonic mean of P and R
    '''
    from loc_h_dmv import mpp
    recall_num, recall_den = 0, 0
    precision_num, precision_den = 0, 0

    for sent, parse in tagged_and_parsed_sents:
        mpp_sent = mpp(g, sent)
        for pair in parse:
            recall_den += 1
            if pair in mpp_sent: recall_num += 1
        for pair in mpp_sent:
            if pair[0] == MPPROOT:
                continue # todo: add ROOT to parses? (see below)
            precision_den += 1
            if pair in parse: precision_num += 1
        # rooted_parse = add_root(parse) # use? todo
        # print "No single possible root, todo what?"

    recall = float(recall_num) / float(recall_den)
    precision = float(precision_num) / float(precision_den)
    F1 = 0.0 # guard against the zero-denominator case below
    if (precision + recall) > 0.0:
        F1 = (2 * recall * precision) / (precision + recall)

    return '''Recall: %d/%d = %.4f
Precision: %d/%d = %.4f
F1: \t\t%.4f'''%(recall_num,recall_den,recall,precision_num,precision_den,precision,F1)
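
# Worked example of the F1 above, with made-up counts: recall 8/10 = 0.8 and
# precision 8/16 = 0.5 give F1 = (2 * 0.5 * 0.8) / (0.5 + 0.8) = 0.8/1.3,
# about 0.6154.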

def compare_loc_h_cnf():
    reader = WSJDepCorpusReader(None)
    corpus_size = 20     # assumed: matches test_likelihood's default
    corpus_offset = 1000 # assumed: matches test_likelihood's default
    tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size]

    import loc_h_harmonic, cnf_harmonic
    g_l = loc_h_harmonic.initialize(tagonlys)
    g_c = cnf_harmonic.initialize(tagonlys)

    initials = [
        (g_l.p_ROOT.iteritems(), g_c.p_ROOT),
        (g_c.p_ROOT.iteritems(), g_l.p_ROOT),
        (g_l.p_STOP.iteritems(), g_c.p_STOP),
        (g_c.p_STOP.iteritems(), g_l.p_STOP),
        (g_l.p_ATTACH.iteritems(), g_c.p_ATTACH),
        (g_c.p_ATTACH.iteritems(), g_l.p_ATTACH)]
    for a_items, b in initials:
        for k, v in a_items:
            if k not in b.keys(): raise Warning, "a[%s]=%s, but %s not in b"%(k,v,k)
            if (k,v) not in b.iteritems(): raise Warning, "a[%s]=%s, but b[%s]=%s"%(k,v,k,b[k])
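
    # Checking both directions (a's items in b, then b's items in a) makes
    # this a full equality test on the initial distributions. Next, compare
    # the chart values the two implementations compute per sentence.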
    import loc_h_dmv, cnf_dmv
    for sent in tagonlys:
        ochart_l, ochart_c, ichart_l, ichart_c = {}, {}, {}, {}
        i_l = loc_h_dmv.inner_sent(g_l, sent, ichart_l)
        i_c = cnf_dmv.inner_sent(g_c, sent, ichart_c)
        test("%s"%i_l, "%s"%i_c, "i_l", "i_c")

        for loc_w, w in enumerate(sent):
            w_node = (GOR, g_l.tagnum(w))
            o_l = loc_h_dmv.outer(loc_w, loc_w+1, w_node, loc_w, g_l, sent, ichart_l, ochart_l)
            o_c = cnf_dmv.outer(loc_w, loc_w+1, w_node, g_c, sent, ichart_c, ochart_c)
            print "%s, %s, %s"%(sent, node_str(w_node), loc_w)
            test("%s"%o_l, "%s"%o_c, "o_l(0,1,(GOR,%s),%d,...)"%(w,loc_w), "o_c")
# end compare_loc_h_cnf()

if __name__ == "__main__":
    import loc_h_dmv, cnf_dmv

    # compare_loc_h_cnf()

    print "\ntrying cnf-reestimate ##############################"
    g = test_likelihood(cnf_dmv.reestimate,
                        initialize_cnf,
                        cnf_dmv.inner_sent,
                        corpus_size=20, corpus_offset=1000) # assumed: signature defaults

    print "\ntrying reestimate v.1 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20, corpus_offset=1000)

    print "\ntrying reestimate v.2 ##############################"
    g = test_likelihood(loc_h_dmv.reestimate2,
                        initialize_loc_h,
                        loc_h_dmv.inner_sent,
                        corpus_size=20, corpus_offset=1000)

    print "main.py: done"