Rewrite train for easier maintenance
[vspell.git] / utils / train
blob64c21a5bc648c7f1fdea473e5330ab52c4ea98bf
#!/bin/bash
#
# Run one training iteration of the bigram language-model pipeline.
#
# Usage: train PREFIX ITERATION
#   PREFIX     path prefix shared by every input and output file
#   ITERATION  non-negative decimal integer (no leading zeros)
#
# Inputs:  PREFIX.scz.gz, PREFIX.vocab, PREFIX.vocab2, PREFIX.ccs and, when
#          ITERATION > 0, the path PREFIX.arpa.<ITERATION-1> is handed to
#          sc-train through --replay.
# Outputs: PREFIX.sc<ITERATION>.gz, PREFIX.wngram.<ITERATION>.gz,
#          PREFIX.idngram.<ITERATION>, PREFIX.arpa.<ITERATION> and the logs
#          PREFIX.log.sc-train.<ITERATION>, PREFIX.log.idngram.<ITERATION>,
#          PREFIX.log.lm.<ITERATION>.
#
# Must be started from the directory that contains ./sc-train and
# ./sc2wngram; the CMU-Cambridge toolkit is located relative to that
# directory as well.

# Abort on the first failing stage so later stages never consume truncated
# or empty intermediate files.
set -euo pipefail

readonly BACKOFF=-witten_bell
readonly MEMSIZE=64M
readonly POINTCUT=4
readonly CMU_SRC=../../CMU-Cam_Toolkit_v2/src
readonly ITERATION_RE='^(0|[1-9][0-9]*)$'

if [[ $# -lt 2 || -z "$1" || ! "$2" =~ $ITERATION_RE ]]; then
  printf 'Usage: %s PREFIX ITERATION\n' "${0##*/}" >&2
  exit 2
fi

readonly prefix=$1
readonly iter=$2

# Scratch files of sort and wngram2idngram go next to the data files.
work_dir=$(dirname -- "$prefix")
readonly work_dir

# --replay is always passed; the previous iteration's ARPA path follows it
# only when a previous iteration exists. Building the arguments locally also
# keeps a stray "ii" from the caller's environment out of the command line.
# NOTE(review): at iteration 0 sc-train receives a bare --replay - confirm
# that sc-train accepts the option without a value.
replay_args=(--replay)
if [[ "$iter" != "0" ]]; then
  replay_args+=("${prefix}.arpa.$((iter - 1))")
fi

# Stage 1: sc-train over the decompressed corpus, dropping every output line
# that contains '<'. grep exits 1 when it selects no lines; that is tolerated,
# while a real grep error (status > 1) still aborts the run.
time gzip -d < "${prefix}.scz.gz" \
  | ./sc-train \
      --wordlist "${prefix}.vocab" \
      "${replay_args[@]}" \
      2> "${prefix}.log.sc-train.${iter}" \
  | { LC_ALL=C grep -v -F '<' || [[ $? -eq 1 ]]; } \
  | gzip > "${prefix}.sc${iter}.gz"

# Stage 2: sort the stage-1 output and feed it to sc2wngram.
# LC_ALL (not LANG) is set because an inherited LC_ALL would override LANG
# and silently change the collation order.
time gzip -d < "${prefix}.sc${iter}.gz" \
  | LC_ALL=C sort \
      -S "$MEMSIZE" \
      -T "$work_dir" \
  | ./sc2wngram "$POINTCUT" \
  | gzip > "${prefix}.wngram.${iter}.gz"

# Stage 3: convert the word n-grams to id n-grams (n = 2).
time gzip -d < "${prefix}.wngram.${iter}.gz" \
  | "${CMU_SRC}/wngram2idngram" \
      -n 2 \
      -temp "$work_dir" \
      -vocab "${prefix}.vocab2" \
      > "${prefix}.idngram.${iter}" \
      2> "${prefix}.log.idngram.${iter}"

# Stage 4: build the ARPA language model for this iteration.
"${CMU_SRC}/idngram2lm" \
  -four_byte_counts \
  -n 2 \
  -vocab_type 0 \
  -vocab "${prefix}.vocab2" \
  -idngram "${prefix}.idngram.${iter}" \
  -arpa "${prefix}.arpa.${iter}" \
  -context "${prefix}.ccs" \
  "$BACKOFF" \
  2> "${prefix}.log.lm.${iter}"