softcount: tolerate zero ngrams
[vspell.git] / utils / train
blob71753122dc1605236ad71ce672557fafa0167302
#!/bin/bash
#
# Run one training iteration of the bigram language-model pipeline.
#
# Usage: train OUTPUT_PREFIX ITERATION
#   OUTPUT_PREFIX  path prefix shared by every file read and written here
#   ITERATION      non-negative integer without leading zeros
#
# Reads:   PREFIX.scz.gz, PREFIX.vocab, PREFIX.vocab2, PREFIX.ccs and, for
#          ITERATION > 0, PREFIX.arpa.<ITERATION-1>
# Writes:  PREFIX.sc<I>.gz, PREFIX.wngram.<I>.gz, PREFIX.idngram.<I>,
#          PREFIX.arpa.<I>, plus PREFIX.log.sc-train.<I>,
#          PREFIX.log.idngram.<I> and PREFIX.log.lm.<I>
#
# ./sc-train and ./sc2wngram are resolved relative to the current directory,
# and the CMU-Cam toolkit relative to ../../, so run this from utils/.

usage() {
  printf 'Usage: %s OUTPUT_PREFIX ITERATION\n' "${0##*/}" >&2
  exit 2
}

O=${1-}
i=${2-}

# Reject a missing prefix or a malformed iteration up front: without this the
# script would quietly build paths such as ".arpa.-1". Leading zeros are
# refused because $(( )) would read them as octal.
[[ -n "$O" ]] || usage
[[ "$i" =~ ^(0|[1-9][0-9]*)$ ]] || usage

readonly BACKOFF=-witten_bell
readonly MEMSIZE=64M
readonly POINTCUT=4

# Directory of the output prefix; used as scratch space by sort and
# wngram2idngram.
workdir=$(dirname -- "$O")

# Iteration 0 passes --replay with no file argument, exactly as before.
# NOTE(review): presumably sc-train treats that as "no previous model" --
# confirm against sc-train's option parsing.
replay_args=(--replay)
if [[ "$i" != "0" ]]; then
  replay_args+=("${O}.arpa.$((i - 1))")
fi

# Stage 1: run sc-train over the compressed input and drop every output line
# that contains '<'.
time gzip -d < "${O}.scz.gz" |
  ./sc-train \
    --wordlist "${O}.vocab" \
    "${replay_args[@]}" \
    2> "${O}.log.sc-train.${i}" |
  LANG=C grep -v -F '<' |
  gzip > "${O}.sc${i}.gz"

# Stage 2: sort the sc-train output (bounded memory, gzip-compressed
# temporaries in the prefix's directory) and feed it to sc2wngram.
time gzip -d < "${O}.sc${i}.gz" |
  LANG=C sort \
    --compress-program=gzip \
    -S"${MEMSIZE}" \
    -T "$workdir" |
  ./sc2wngram "$POINTCUT" |
  gzip > "${O}.wngram.${i}.gz"

# Stage 3: convert the word n-grams to an id n-gram file (n = 2).
time gzip -d < "${O}.wngram.${i}.gz" |
  ../../CMU-Cam_Toolkit_v2/src/wngram2idngram \
    -n 2 \
    -temp "$workdir" \
    -vocab "${O}.vocab2" \
    > "${O}.idngram.${i}" \
    2> "${O}.log.idngram.${i}"

# Stage 4: build the ARPA-format model from the id n-grams.
../../CMU-Cam_Toolkit_v2/src/idngram2lm \
  -four_byte_counts \
  -n 2 \
  -vocab_type 0 \
  -vocab "${O}.vocab2" \
  -idngram "${O}.idngram.${i}" \
  -arpa "${O}.arpa.${i}" \
  -context "${O}.ccs" \
  "$BACKOFF" \
  2> "${O}.log.lm.${i}"