From 5b03c5de2eeb9cdffbc9271eb0fe05c5e84520ba Mon Sep 17 00:00:00 2001 From: Kevin Brubeck Unhammer Date: Sat, 27 Sep 2008 10:12:18 +0200 Subject: [PATCH] Update to remove faulty gold parses. Seems Done now :) --- report/report.pdf | Bin 170226 -> 170384 bytes report/report.tex | 68 +++++++++++++++++++++++++++++++++--------------------- src/main.py | 55 ++++++++++++++++++++++--------------------- src/wsjdep.py | 4 ++-- 4 files changed, 73 insertions(+), 54 deletions(-) diff --git a/report/report.pdf b/report/report.pdf index 6d37c40cf64c1239c24cd9e9fd49bb7e5d89c99b..5d071330ee43bc30b0d9c993ac2de1507a70012d 100644 GIT binary patch delta 8026 zcwU89c|4Te+c#rvkZgmBhU_uTea}7j+_#WDLI~MXwk(ycvP29DAqr_!B)cqyilRbN zktIoyEfgY!$gX!h&-;7c_c8D1^UgnW&G%gAe6Q;|*PQb`GiA6d_i?2?BUltn4)0W; zZ8Z+)(fje^XlBQ?ptc*$A~i{socRVvkJ`TK>*|q6+Ai;V(syT~qCw}x+7delLAUY2 zQ1R;T$xIRc^L5P*Bo+?VvANF1xE)Spv;8@a1#(s&Szb!DH%jl1H}zG0+$6Vyb?4AG zBLju=e0RI=esWz}92dW2Ato4y8l;ZgTZ)xzTC(B;oYP(#(Hmi}lhm9;nd)tjlZ9A0b zIfv}kI>A)Qf!76~6i-as%eLa=_q*>UU%0mKg*W}D!^aDUE#_Q%wyY)de8uW@htA+d zv}xrNRknisZAoks#l5aYo#L7I%ZnZMh^t*50LnaLXIx+B8ONCPD(laRa7q<}{C#XvRlo3`CD0NqsE{eW`{E2f*Tcu=D)&CNM32sYOSxUghEBmRy>IsQF6^Y0N;YNr=dKjpO@_h2{^4`K&6i_NJ%gD~rY$C6rb)n~Ler68uUc_yVtqh%5x_rj|F^|H3(B zN90NMmgIk^ld@B;*rl#h>wUaZet|3VddekEXuE3NFI?w`HZ1P2r`;`FYfgjUynSZ{ zO=378V~_KYgXPtuTZPP0-<`VX@`lBoS_Cv&MFl^_9KJTHD)pm{tFCZg`$=kN?8je2 z-g-vk{@?fxMT1=y#^%kMl`7HvanQB?qdJ+UvbRJN^&q9KzZ4I3y*S{yM{!UjEi#d$ z!bW%2ap^KnufJYWy8LCbv+<`Kt}989f}!mZlzdc90=4!{yOgOfF4!D6HOAH&X^`Al zB4@u7Z49A2R$YA6c;e@d`2|2xD8|UK+*-8$*$?`+=DacS!EKR>3z~hSS{_~Z;#Eb! z{RzH~#li=3S@yn;0{6>%$~n4Uz+le8p`ojT-&g1NK3kX??zvCh_r<~7(P2rhv2L6u z)VinFt+v+j?erML|Dk|YO+EO}$j}Oxv#gNsiqX027Sd6|@ANYhrW;NzFYY?kVH=jB zE=W@mP;pHE;M2cvVfmZ)=zy{IRFaFicPmZpd5)2e(1e@QK@a|b569|)^io~;kA_W* zKaTU9nsLN?;;!9p>9GFJQ!6Wpgk4Bopt;!`wwk*ks?e0BTrNzR$?}+RN(iM*ILAak zHWWIS@jCrL$;Eos9}fhQ9@VjE$w zZ2e+pmsIFBc2QGzQ>j-*<83Epm!#ibko$Z3>j|6rb9CHA7 z9n>k)O;vjuip;#*WG(Xqi87o1GkeQ)u%mP7tXFK`o$I4BMSO*Rvn2_0kG<}V&d_F; zQk^EOQ|0!?>}k-OdzkL@s-Ek0?VAeoljjB+v}95bw?9|>{njeAt%a0A?l2P=qhEEZ zAC(|D8I2s26$uVeR2nMgms+w=Bn-yrz22Lr!c$&VaL!3MXrb}YO5D2*RE}=t z%8AECXG^lS8)!OKJ)ZSY3WeP4^~U{a-}XmvJf=#&KAq9V{nZiPLQ;49z^xxH6E+$l z;rKnY!Y*GWEIKb^Q|+pFo-2*2p;2C|X($n5!a+K2mSTJ}MOjul9>ml*pJkzi48g;G z*KAsp9I$*f89DEd9pLQsbRd)#e|b?zZ*Drak~ZMtIdS}&sP9B%>O&s0s;*HwZO>wh zeMhXC0q?z~Oeq7g+xY#$&>NiY>7FguUSMhult>G6sh+aCcE;!6QqG;9YQ^}4)}*DS z#US0QNvD_c=a0QBt17Z{@BC`TpViHRH1FSe_s#wj59$4T8-(sT@?*P=!G)!g2^Ll$zc8JcW1aMxB^R=Vf<;zAZ{kPpn-^wiS=uh_@56$oht_`f27*Q{M-g zbDfJImlPHHp&EGKO^-0N(lghNNWJb+IAP=)ul8~Cs$ff*p@o6cqX%Qjf$6%3-laKQ zK{KDI)zd8}5^R?{67;Lp<$EW+;B>B=shSe%q4Ip*dNWfF@WFgee8QHVt74OV_d#7hU*6YXQczj?iC)Q|)Tb#IFPok2_7yHS&!Uv9yz6Mbcro zIEm&8&xafhhf7;oam`NEK8x0Xn-$;tK(b4Gjd|Jh^r^BH7pI=JZ*jr0n=Nic@dN=_ zGdVLBgDjR(oy3E=nEkU?V-2@yvNgJm&s5xNj31^eZ4Lj

)hBttYQK>Pc>GfN5Am z@Gb6Aqx&vlKiGkTm4gwF_6yPiD1i473fjFi=|#T2zfc@XsO6`h55AJX_H%)=!Q&3bp$<<umd#*(pi6 z2bVU_O4oDGKXZpvy|dTmTC1=U56q8vJ+$e2#-gh>S?FtQ2=G8FF`J+6jxzg|)jYmu zb9QORt7tCqsHU-oQDJzKoF0Bhx8xHKVRhF*gI2kclcNKFSi8;tDBV9XEr!_vAOI1C zQ33^_5@7%!00=^aAPNB{0Yoy0COaw$CvT3!B|o1zCQL?&FqJ|8U@DOeBLp%9AVOfW zqyj$q;I9+HREP*5AOT>YR1_e92!#ldliTm%|C&GZ>zFVE5fL&#Ad?UxnF13)3QVLR z$rv$w@{|%+^0PU?d%9sx@dk7_;_j9Q5mLSveUdw@{w}Uz0PgHK>JuV&D z>bgTZOoMf7c!D)>&c|;xjCuI6Nke&MX7VI3~H0pW!>$TFm$`H8>ia!LOOc#H((#c`+<~{5-KC{N#n`k) z>P@zU?~P4we?e8TOWN6n+I_Fy+6t;%F!}EO1xjv@ABeFSavM?hTy3x%OwEYA!gGP8 zYL9dNOPeeYj7~yT%8@v+vxfMAtuuF2KXnE}YZ7eZ+j-SK+*-`K_+3Y_3FAnsxqFt! z|Ng9JpEVz=r0nJ2w<)&#hl!V)LbvB>7`^*UgzQ+%`zr@csLB3*11&w{KX|e>O;>je zZ_YJwdMEWDU&?==82x@gu=e^WceSPGoq3mDenf0ndFs$rd23sjU5&9@a%N-#74l9^ z{dOx9+pg-YW&a|)GI@U3TF=;6g_eCUfOqKGkFkm5i^FVvNvtEH@n0L<##zo4WZ%@@ z8$q#M5i+xxgTGghS?Vf&@I)DIpOR$xR3d4MQzn^;J-r`3a8(%G<0j(FtynOT-^w9; zdF!Fe_i=vI`G}Ea$1%@I9`357U3XJ0K2P7?ReIA%bhOR~*9#{MkNFLxw7AgP-*V%k zbU#0ToJAQ3h79gTNZmHC)ONPjD0mgid%nPpw0FULTU3}Gb^g*v;WKslhR!9Mpj?Sw zC<^VV%p5(h^j$BuULjv3JGTfp=Fz-o(`C{F>2}BCkZ2)%P4C1#GCyVOrG9bQ5Yle< z3$MEq4YD``#(@XUv?yb9tB@L6Hc&3GcSJzNej(Mh3j97~ygDh+W_fX{O~LQQmoU|@ zVs~u5Oh?$+-Y9isU6N9EZrii-L3Pl+^tE+m6NNrv^fokG%U_r*Vl*e(x`>VHt;>PA@FyCyjx# zH7i5z`wDj@KD}OT9;2xzRb87hx;x`dm&UtEjm6LUw{}J9PGPqM-q7$u@g&;rj{;I7 zmklc-K#H2w2zxpZmTjYQGj4m~`KyK*b{8~s<9udcRhRBO^hKnh%3|Lyt5#lWUB8d5 zSsFJjKk z%X_%5J`{cQ^z`+cbF!m0vl5BUz6s#|q0@5?RpJ+Tbs7a*_$NqxJ)dLJB5}ZIrgVE z2ItRLohElzknN6)7`ZOzk&Y=*tnED-)*sHJwTUaq*;q}{Fy+#Z`u9C=*G{xZp3~C& z@(bEx>K-MgwAAzXNTm0U*i5zRCPnW~H0$K=6U72N#%aC2j@@7GcUk$uB8UClCepiS zlk5EtcxOU~o0Y7)UUpB<%vO2Rq8nRe`c@q-aoP!2jBCAK(_XWFeia+}M$l+b%WtP3 zn{iSU=7^AQY2<76^9R2^zZrkx((XsGV|anUm|HgkzsQ+#KFJZIqF4CuZwt+Am1!Hr zmfSu#{>EW{A&y$IcCd1e7IF5<(1XyCvG1nmf28n}@9Pz>NFn>gtAtSU%yE_qKtL zdp}LQJMt(_NICboHnz%UYSF6*JP1Z{%*Y-}&I-HZgh-+>8nnjr6wn$I+3QGyy zzPba{QD8-Kp6(xzHT&#m^CGRGk!CQRWLR7yX;3MF2L8#YFcv?asuB9q^p1iyYh7mm zpH+>0=F1`B?W`hCwktxn~-mHpK%6SsX=v(S&UhOd6)S6Ge3e0hT5RE5Gw&+aPimu*M=9Hckj(#E~ij|t~FYg3@lb_MPCLo zoVNuW125FJm!L=XJf4X>l9(1*A@$n|KfY*cyP!NRcY!fz$2Jc1`i)c0?BVOk1t0PU^$lrtYS_ZtSccm(H3a(t6e4W5Y6*QOlb zQ)7I^;%$9pLFi8-YP9WfT2-$9-f``1{kdIEk6NVP3?9MC-G6fBj1Y>|nya~xeLdh| zDh9*|lgUI9d0k~OzDW=PhF~I<@*mYl|AWCC!vpKWiA(@clp)L*%8(2YK{Aw_Ikzp@ zA$lwQ9u{NyA0H4z7-9=$@-bn>=!%e03Xuwv2@pe(K~$6gkr>Jhfarr97(IFo8%CKf z!G_ThAtN9WB{POW3Xudcgdj?0NIc3vZl~5`CWDlJ)*&+#Bbh?~%z@drKF&u^W5+lP zg8)O0G9qH=SQKSAkx4`{@^2>zdNc<{7r(w775ry85TXzPki5?KVm&VeBr{@Qh)Iwk zA{lv6sDF*o=qg+oCr(DTL@I>Pv$-$`2MiJc0HhCcVW{gih(L6SO&BUW2oph&Znp^o zbCRhr5hlU(R0hrgkQrJVq<5|lfDC#v6`>2@Flrcbv?&f#X^O)Zp5mmEPzWTUbW8|# z`+t5^N{3)&F|1Sym2Mo0HD-m$5Iud}pui;h_# zMMV&X*55FQpa`Wh{sPtmD}o}VjiXeAQAXZqth5pWP{>SkDJ1H@F%nE&U(Y@oi^0$U z1l~9UX3#NVR0@+H6=ud@7$9$CLm2NQCX59AHJ9FZ5o^y%CIfV}7_1{>vq+VhF<471 zD-4n=7vr!(7&a=S_v!rcSZOu{AiM_41ho=-8cy=Ljy=C z(*%&r#KCxnGQ~$GZ`=hEM5ZH)O5N1Cz z1!7(gA^q=QXn%t!$ke@5h|07eOof;-rZO6x38NrP#0ZF>8#!PSjm(q@0>R9Ui~!74 zKp+Y;Mr9&q941UXfs!bHR{{PPh%!6E%)N@D5dG0rtSpVn)L2x2#1s@2V7&h~Vj%Mt zGHT0|B;#1$STz!LJ*bVnLuFhI8@GmvLQD-xMX5{`N60WU29YQmgFz_Bzc~Kt9RwyZ zkuY|Pvat#XLPj@|Fm?vnNW$2ejfnsxVEtm>h#|C%RZZBSXLgN21JU2g6t(0Qk8mVA?QF_Ndmjdj3R84k6ZT1l6rUBKvaJJ_!7ptN#x`xtu{K?L(ry;vW*VC!U zD+UVsUF8b9FL#40E_)V+297;CcHP{&Y^W05IDV(Zmg`w-j8t(@-rf5ndoJ*mUHa|} zWzZ+sohMG6YTb?643fQl>e0_ViF<04AKsGk$RpU5?t3^SG&$kqMG@0@xGOb6@i=#O zaO}_7yexxmmnC2HN%imN5uqtkC&(@iw#I_Xg8)euxe-P2K*?EJ4!Tr)meK0j_{Hky_@ zue{%UJxMM)$2az@$O}a&i*A(%fp5H?6cR8UrEnG0-Ti*9@#9XQ^&2*(fW-LN| zN-6s5+N$Ka01M1iUWOxfkJhP;MRA3l)@puBdu>XbG@5?*Nm{I(MSplRaR=jz+Sly6 z3-YZhu5}x-vwT0|vm;slLrSsm9r+WzcFE$WhmTlR*=35BJqwPaK5w3hEn5$ZvFxbd zv}tO;6`|$t9kBW$C1`Uo<#sz@FR>AIl!vI6UR2@u_vrU`5(yZTsN?iZPoi65>gVG< z#XZ0MR$HhAf(P*nX-k*xoAn5NOdD|N8{vV@id>5o?dSibDW>bJaP#t+yzuIXTXttI zAfTJPg4_!C?l_&da8GHhyYhWwa2~q0^yP(luZ<>#P1dqyMJy`V@kmPw&tN#O#G_%f zmum8Xy=Dua=eqQgu0wQLUk=H}y1FYtmbdU(-I2Ek@HA_4?LZCOUA%}ksr?czfnwbP zHD1$g!5_M=sAwngmKfy0bFojR*STe#cjqEzP-|=UQ*HH|#L8Z}*j8YWx03ka1G}>l zrYH8h+h4ua7sQu&;S=)B{xsR;BZ|sbe<3C%8#FcvJt3C)s^=ve;v%6_&rF6dMJ8q| z*X-G!Uy6NStdzXdY&9(&_3A_Z5uFBFfH>qUq+qlx==)NM;1MDof^ccw8DDMYu%|8V zDTmVYLkDz~!{zp;@`zb;1afvg3m6Z{NUAf?AD77il$bmn@eS(bEaq2k-Sm z!JnpI3*+}DZ7ipM>ZL6{d!0^5z90WyNbJBt{+%b4Zwo0tejtp#^&>)VxLekOwc>_C zj)j`G^8FWwsJc-v-q8YYUhK~4YQIMEB*YUGzYlb^YjmV`ZQY`#ytqyCtD`Qm&$Ktm ze@N}f6sj8Tv^90OXewaa-RH23V|#Y(k}2e{vmSR|-6=?|NI%kvCd753dFH#<4{ldy z_pt{rn|4^9x~8nMr7W5y68$V+Rrdjp(8$Q1AEDN3$K`feqPNPcujPnlH6DnLXv9Yg zjNZ|23s=>>%to)+VHtY;%wYxTNWpfC>Fz_*-Jn5R{~>WzRLRZEJwiti*e4w}2TK-X zA@Bu7Y3;sBMvMMQC*MYmK+Msc1-2RKh*z!O_IS0VC8%ws8cR)>9G~B;ecUK33a#|E z1?PRRIu`Qbul3oNf{M-pzESh@f`-4p z&anR|@%QZ6RJ&WZs9H5?_@iO`%wH~0OCWm)pjPsWeonx|>AQl)ai28Ty0!=_#bWh+*#kkg?oxo{m zD9dc=Tj5gF7j(!OGkTfhkE;W}w2KzYcf53q>j(}>()}968h%=5i7LX1+l4KYrtE#AT+>!x**sD?2EO*!1$@jYQY(sw zJ96G-{0yL5@E!bRw{(H)r2YcWNgi~QzYSXE^f{#j353Y*r&TIVA^TLFn<1YHrHZ2& zA2J0v9wCd9f&z^F5x7y`6;WZ&3GdxCO{%LZwdsB9I!b1C7uu}EPL?jNIzRCfQ-mI_ zcMd3z%L!TDKQQ%_Lr*6gk)(%Q5Yj)KsK56(WpAuc49Cm0P>+iJ?Xlx@Rf*sYQC0iH zsq5s>EYmCM!hUBn*YrnnvL&B|#TynuRi=fyscZo=Uk4Qa)`=3n{P*2!wcTkUh?!E~ zaICq4WX~gSZwh5+n#;lZ5^PWWl(h|H{S!chY2Es9J@0lghMrRH!D# z+Ch~?G5lucuA4=)TjaG7MfL%tfi(+>&xg zIBdF`bF|50a;QpiBgq5R_r;388cNHYQ$D1XpUhkD8yxrhkvmYcKS);I{tF_kk$d^W zk1p=aL^t#|=}WElUkkO$K7MSO5a5d~pV-@Dvb$&vqKl-bBNA`k=3p@o|Blfst)S|D za254$QW=vL6OjDvo&C#=PwA{72W0HNqa3)F;x8Q=AfwcDqt>DS>5AK=(+{ZWHP-`A zZEdBj@BASq!FHanYQw$zv4{A_xL(LntV69;6V!`mLNCgFh%kYMA85VGoa9;TsjR7+ zP7mry4R5g-IzE@(E)r9Ry??ATyW=juW`OQJPx-lcA;Y7>^=prxE3A~Z`&TC);d*O@ zqPt}jaEM60fRK-FBhg&;ZWlbuVitvwyNN^smPDdp$T&O}PsU@2L@E|XBw)yR01HxR z&HSP?zGyDm$dbJXiGT&E6bznB!4k}gj02NCDU_Sr>OTm%gP!crvRWaJ!%87p{6bhCE;u$EFY(x?~Os4Hy z1KB_TOT?3DF0cuwzHR2O<*?xK_&*x`mc=k_s6r5GY}6&DIpv9S8w!jKPP9vRADIDt zdMA6z-QldXEA1(%(W4N*@`X_D?MzUOL3j*Oq$GId72M#r;>r^7M`Q#?o`o%f9on z6K8rTVFz5_F0aoXmAmXUT7K%_cpARZa>k*_`PZ9@*2IsRccpr%#@bO*Vyd^~T^Gj3 zfYer`)!L)(ms8?LdZOES-YWDrcR@6lJT0GxFJIU57Y$^M-^tA}eR)QY5-5KnQra@P zeAaw0)HTpZsCi7F$+uwk1Jzg~!2gn@_pT>lvVrfGfS129ixZryr^j3D1qWu4u}}~T zZ``4^BC!TlS#?@eR9nhug$7%#<*y1i=NE|rW_s*>65W}y$b7;pkljIzw;0mRk>_Cb z4{X*N=&mL`Sw@i8Q zna`;GF=3SBFE*RKZx-ZJRG&JmwLQsXsZ&b)bmH}38LN(-YId=FZvWt2zTi>5Fa5}&g)62aZ%yvS zcp7yYyAOeJIdq4-ns?b*=F8l)rm$$-BzwFdT)fBfUr&8eim`kZzFQe(E?fUG!K}d4Yu}Z?T(y&I@woDP{g>y2 z-{ub_=M@cF3VCx~(v)=ce%%|PJ2N9Ew-}P1UDMS)NsE2fp%iY?`yl_gm+kl*aT0R+==wL^mBb*_q8^p@e3e@=-%0 z2U29lM4E`=8r~7%=ud_x*C4jGN(o&#Ht~~{o1>V(iD8A#soR(KHRo~~%ZNbXh2E2; zh#i`IKD$?zbDwBNAtw}z?g#Vn*zD%+`(225eq|3o+Xb@-;Asy&RO()!`>QiD589N( zHe~$n3$U0~w;}s(rW(%QH3y`H^$db-TT(1M&S@vRigiOleE~Q#jymb^Rh~-%`;Phv zvEi5Z2?A`fI(mn-#WW6>rA3rMtTKD}I0c>d-{susjnUNp*rsB@f=F+koIR`Tx11Vb z$cq$8*x1Y6uM$+@GR!kq7SizD`sYA@RDHhpg=+qiEq=JIy%kuUKp*^pSHLm z;l&^LIXB4^Ec_Vcs3ezc_H*f)3dZ_(xq~uavKsP>_$>Zf`!?G_;N(!!{cvQSj;uJ zm1qtPZTGHMog$m^=B!cID4X4M0H6-6sOU3^3GwP7{xq*B{cyCxqLyfS_jUuMNs z!=LkJx2uVTGl$GN|MYo?_qfpVz8-Rd#RskjKS#=)u*%!=W z#KBj`srp0Py-@f7_sKWL>{Yikt*_YJ85$Zu+||iE)Ke5@-dSH?JmanEC-i;UAi04{ zI6C9}y$y%r1ZwB*?kyY113RU^W+i>+wlL%PA>!PlJ!GVMSho$@;>orLwy zYMpETv|Q+=gcco5$nocgBK>{0iO+BB?va~a3ue2 z14pLNJA+WhtnhSzK;ISo#{|hB-6a?vrT_rFcH1Ds`Ey@~4HA`1x4Ql(41r#-Z4jv- zef^ICuh%%_PZ)rHciX^|aP)5>f5H&y>Y=D3Xjre4>CvGmHBo?w!EOIAZWylZCmt>u zIN_mC6jh9ZV;%$vluDH_loLt}BoG)!DP;I)3&K%ysuU`f#55Nc=Kqbs%LWPL%Jpy* z5_tee02ybHDFBWMgZG6gEGpAP)C`1O+K{-$>Ls7CePWAKEsEctWLC z6zUX;1t60u=`ko_B$^DuyIm5Cl0#Fd09ZK~i^@W>;3zn{e?00PJe5#so`7;dqHrL+ zF%jhrZC^WvNCZ5Y8G~anVE~!Tgpmn2CX7M>m@w+UHjeKc1AcjFA0ml@8NMKG5{yYR3F)|D@VFUs*2H(PM4EisS!dNiyIT3J7fk8Z# z$svit%p;QUObda_^FJ31{R@Oqrh0-u_n5NY?xu_mVKNg2Fs~Tg?&}OONFYRECK7SX z6-*{E?*bX5Fkuwt6@ws|DQOU-{;U2D|0hT#G7;gTXDU=G1!N2Yw@w^g_ZCV40-2Jc z;0bs}zi{7XIzqU?F)fk;P#AX&M}hA%6A%7?2N~;$LM1S^2^b?W?Ks?%{tpbKG6n(R zzQgDbZu|cg<4-+-aOY#(1-NMv8G}%9cmiV(Dvrn)1dqcL35=Z-hllSPBSs=ZOofai zk%^2L9;7g1R2;~V3?9CuWX5EO%w>Wn5%83M9TW7QAc?V%;fjYZB_jrsDa;s!#Q5HU zTMLEp29a?L#SLwPc(@)IiFg9@WIRA*=8<4U;F6cuKW4!7KZC>G_W%F@ diff --git a/report/report.tex b/report/report.tex index d3b94ac..d630da1 100644 --- a/report/report.tex +++ b/report/report.tex @@ -1,4 +1,4 @@ -% merged Kevin's & Emily's reports 2008-09-21 Sun 10:39 with Emerge +% merged Kevin's & Emily's reports 2008-09-21 Sun 19:04 with Emerge \documentclass[11pt,a4paper]{article} \usepackage[utf8]{inputenc} @@ -534,44 +534,39 @@ next section discusses the effect of these. \subsection{Results} We compared the results of the implementation with a dependency parsed version of the WSJ-10 corpus (converted from the manually annotated -version). Since single word sentences were not POS-tagged there, these -were skipped. Also, the dependency parsed WSJ-10 did not have ROOT -nodes; so we calculated precision and recall both without counting our -ROOT links, and with counting ROOT links, by adding these to the gold -parses where possible\footnote{221 parses in the dependency parsed - WSJ-10 had several tokens appearing as heads without appearing as - dependents in the same parse, here we skipped the parses when - calculating with ROOT links.}. +version using a Perl script by Valentin Jijkoun). Since single word +sentences were not POS-tagged there, these were skipped. Also, the +dependency parsed WSJ-10 did not have ROOT nodes; so we calculated +precision and recall both without counting our ROOT links, and with +counting ROOT links, by adding these to the gold parses where +possible\footnote{221 parses in the dependency parsed WSJ-10 had + several tokens appearing as heads without appearing as dependents in + the same parse, here we skipped the parses when calculating with + ROOT links. Our gold standard also sometimes (for 1249 sentences) + had one dependent with two heads, these were skipped from + evaluation. We have not yet had time to run evaluation of undirected + dependencies.}. \begin{table*}[hb] \centering \begin{tabular}{l|ccc} Model & P & R & F1 \\ \hline - LBRANCH/RHEAD & 25.6 & 32.6 & 28.7 \\ - RANDOM & 31.0 & 39.4 & 34.7 \\ - RBRANCH/LHEAD & 55.1 & 70.0 & 61.7 \\ - K\&M's DMV & 46.6 & 59.2 & 52.1 \\ + LBRANCH/RHEAD & & & 33.6 \\ + RANDOM & & & 30.1 \\ + RBRANCH/LHEAD & & & 24.0 \\ + K\&M's DMV & & & 43.2 \\ Our DMV: \\ Uniform initial distribution & 21.0 (18.7) & 20.1 (18.1) & 20.5 (18.4) \\ - $C_A=0; C_S=1;C_N=0.1;C_M=1$ & 23.7 (23.7) & 24.8 (24.5) & 24.2 (24.1) \\ - $C_A=0; C_S=1;C_N=0.1;C_M=10$ & 26.7 (31.6) & 25.5 (30.5) & 26.1 (31.0) \\ + $C_A=0; C_S=1;C_N=0.1;C_M=1$ & 24.8 (24.5) & 23.7 (23.7) & 24.2 (24.1) \\ % local screen + $C_A=0; C_S=1;C_N=0.1;C_M=10$ & 26.7 (31.6) & 25.5 (30.5) & 26.1 (31.0) \\ % uib screen % $C_A=10;C_S=1;C_N=0.1;C_M=10$ & 25.6 (30.6) & 24.5 (29.6) & 25.0 (30.1) \\ - $C_A=10;C_S=1;C_N=3 ;C_M=10$ & 26.0 (31.0) & 24.9 (30.0) & 25.5 (30.5) \\ - $C_A=15;C_S=3;C_N=1 ;C_M=20$ & 26.7 (31.7) & 25.6 (30.6) & 26.2 (31.2) \\ + $C_A=10;C_S=1;C_N=3 ;C_M=10$ & & & 26.3 (31.4) \\ + $C_A=15;C_S=3;C_N=1 ;C_M=20$ & & & 27.2 (32.2) \\ \end{tabular} \caption{DMV results on the WSJ-10 for various initialization values (numbers in parentheses are when counting added ROOT links)} \label{tab:dmv-wsj} \end{table*} -% trying locally: ? -% HARMONIC_C: 115.854497176, STOP_C: 1.47684590293, NSTOP_C: -% 4.27464793921, FSTOP_MIN: 6.9710245489 -% P: 11627/45444 = 0.255853357979 | P_r: 15710/51712 = 0.303797957921 -% R: 11627/47419 = 0.245197072903 | R_r: 15710/53466 = 0.29383159391 -% F1: 0.250411897096 | F1_r: 0.298731673924 -% -% trying remotely: uniform distribution -% todo: fix results when done We tried various values for the initialization constants; but it was hard to find any clear pattern for what worked best. @@ -596,6 +591,27 @@ not clear whether their DMV-experiment was run using automatically induced word classes \citep[Schütze, 1995, in][p.~8]{km-dmv} or on the tagset used to manually annotate the WSJ-10. +% underproposed = # $C_A=10;C_S=1;C_N=3 ;C_M=10$ +% {(('NN', 1), ('DT', 0)): 347, +% (('NN', 2), ('DT', 0)): 148, +% (('NN', 3), ('DT', 2)): 136, +% (('NN', 4), ('DT', 3)): 144, +% (('NN', 5), ('DT', 4)): 128, +% (('NN', 6), ('DT', 5)): 124, +% (('NNP', 1), ('NNP', 0)): 358, +% (('NNP', 2), ('NNP', 0)): 125, +% (('NNP', 2), ('NNP', 1)): 174, +% (('NNS', 1), ('JJ', 0)): 124, +% (('ROOT', -1), ('NN', 0)): 100, +% (('ROOT', -1), ('NN', 1)): 106, +% (('ROOT', -1), ('NNP', 1)): 140, +% (('ROOT', -1), ('NNP', 2)): 104, +% (('VBD', 2), ('NN', 1)): 145, +% (('VBP', 2), ('NNS', 1)): 111, +% (('VBZ', 2), ('NN', 1)): 152, +% (('VBZ', 2), ('NNP', 1)): 107, +% (('VBZ', 3), ('NN', 2)): 109, } + In the stochastic grammars given by the model after EM, the POS class $VBD$ (past tense verb) had the highest probability of being attached to by ROOT, followed by $VBZ$ (3sg present) and $VBP$ (non-3sg diff --git a/src/main.py b/src/main.py index 7b90b7d..a4384ba 100644 --- a/src/main.py +++ b/src/main.py @@ -16,11 +16,11 @@ def initialize_loc_h(tagonlys): # loc_h_harmonic.HARMONIC_C = 380.111684914 # loc_h_harmonic.FSTOP_MIN = 13.5744632704 # loc_h_harmonic.FNONSTOP_MIN = 34.8939452454 - loc_h_harmonic.HARMONIC_C = 0.0 #120.0 * random.random() # 509.63 - loc_h_harmonic.STOP_C = 1.0 #3.0 * random.random() - loc_h_harmonic.NSTOP_C = 0.1 #5.0 * random.random() # 0.1 - loc_h_harmonic.FSTOP_MIN = 1.0 #20.0 * random.random() # 13.08 - + loc_h_harmonic.HARMONIC_C = 0.0 #120.0 * random.random() # 509.63 + loc_h_harmonic.STOP_C = 1.0 #3.0 * random.random() + loc_h_harmonic.NSTOP_C = 0.1 #5.0 * random.random() # 0.1 + loc_h_harmonic.FSTOP_MIN = 10.0 #20.0 * random.random() # 13.08 +# $C_A=0; C_S=1;C_N=0.1;C_M=10$ loc_h_harmonic.RIGHT_FIRST = 1.0 loc_h_harmonic.OLD_STOP_CALC = False print ''' @@ -53,14 +53,14 @@ def test_likelihood(reestimate, initialize, inner_sent, sumlog,msg = corpus_likelihood(g, tagonlys) print msg if EVAL: - E = evaluate(g, tags_and_parses) - print E + g.E = evaluate(g, tags_and_parses) + print g.E for i in range(iterations): g = reestimate(g, tagonlys) print "reestimation number %d done\n"%i if EVAL: - E = evaluate(g, tags_and_parses) - print E + g.E = evaluate(g, tags_and_parses) + print g.E prev_sumlog = sumlog sumlog,msg = corpus_likelihood(g, tagonlys) if sumlog < prev_sumlog: @@ -70,14 +70,8 @@ def test_likelihood(reestimate, initialize, inner_sent, from pickle import dump # let us say g = pickle.load(open('..','rb')) filehandler = open('current_grammar.obj','w') dump(g, filehandler) - - if EVAL: - import pprint - print "underproposed:" - pprint.pprint(E.underproposed) - print "overproposed:" - pprint.pprint(E.overproposed) - + filehandler.close() + return g def corpus_likelihood(g, tagsonly): @@ -95,16 +89,20 @@ def test_likelihood(reestimate, initialize, inner_sent, reader = WSJDepCorpusReader(None) tagonlys = reader.tagonly_sents()[corpus_offset:corpus_offset+corpus_size] tags_and_parses = reader.tagged_and_parsed_sents()[corpus_offset:corpus_offset+corpus_size] - -# from loc_h_dmv import testcorpus -# tagonlys = testcorpus print "\ninitializing %d sentences..." % corpus_size, g = initialize(tagonlys) print "initialized" g = run_IO(g, iterations, tagonlys, tags_and_parses) # make iterations argument, todo - + + if EVAL: + import pprint + print "underproposed:" + pprint.pprint(g.E.underproposed) + print "overproposed:" + pprint.pprint(g.E.overproposed) + return g @@ -117,6 +115,7 @@ class Evaluation(): self.R[nd], self.R_r[nd], self.P[nd], self.P_r[nd] = 0, 0, 0, 0 self.unrooted = 0 # parses where we couldn't add_root + self.double_heads = 0 # parses w/ two heads to one argument self._precision, self._recall, self._precision_r, self._recall_r = 0.0, 0.0, 0.0, 0.0 self._F1, self._F1_r = 0.0, 0.0 @@ -142,15 +141,15 @@ class Evaluation(): P_rden = self.P['den']+self.P_r['den'] str_vals = (self.P['num'],self.P['den'],self._precision, P_rnum,P_rden,self._precision_r, self.R['num'],self.R['den'],self._recall, R_rnum,R_rden,self._recall_r, - self._F1, self._F1_r, self.unrooted) + self._F1, self._F1_r, self.unrooted, self.double_heads) regular_str = '''P: %5d/%5d = %s | P_r: %5d/%5d = %s R: %5d/%5d = %s | R_r: %5d/%5d = %s -F1: %s | F1_r: %s (unrooted gold parses: %d)'''%str_vals +F1: %s | F1_r: %s (unrooted gold parses: %d, double-headed: %d)'''%str_vals tex_str_vals = tuple([p * 100 for p in (self._precision,self._precision_r,self._recall,self._recall_r,self._F1,self._F1_r)]) tex_str = "$C_A=; C_S=;C_N=;C_M=$ & %.1f (%.1f) & %.1f (%.1f) & %.1f (%.1f) \\"%tex_str_vals - return tex_str # todo + return tex_str # todo make variable @@ -168,9 +167,12 @@ def evaluate(g, tagged_and_parsed_sents): E = Evaluation() for sent, gold_parse in tagged_and_parsed_sents: + if len(sent)-1 != len(gold_parse): + E.double_heads += 1 + continue mpp_sent = mpp(g, sent) try: gold_parse = add_root(gold_parse) - except ValueError: E.unrooted += 1 + except RuntimeError: E.unrooted += 1 for pair in gold_parse: dict = E.R @@ -274,6 +276,7 @@ def rnd_grammars_test(): H,S,N,M) for g,H,S,N,M in rnd_grammars1] + if __name__ == "__main__": print "main.py:" @@ -287,7 +290,7 @@ if __name__ == "__main__": initialize_loc_h, loc_h_dmv.inner_sent, corpus_size=6268, - iterations=50, + iterations=30, corpus_offset=0, EVAL=True) print g diff --git a/src/wsjdep.py b/src/wsjdep.py index 8dd59bf..1f66590 100644 --- a/src/wsjdep.py +++ b/src/wsjdep.py @@ -161,12 +161,12 @@ def add_root(parse): for (head,loc_h) in set([h for h,a in parse]): if (head,loc_h) not in set([a for h,a in parse]): if rooted: - raise ValueError, "Several possible roots in parse: %s"%(list(parse),) + raise RuntimeError, "Several possible roots in parse: %s"%(list(parse),) else: rooted = (head,loc_h) if not rooted: - raise ValueError, "No root in parse!" + raise RuntimeError, "No root in parse!" rootpair = (MPPROOT, rooted) return parse.union(set([ rootpair ])) -- 2.11.4.GIT