From 6c9527d43d56948d26889fe93a0c6f56a2416c28 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?=
Date: Wed, 13 Feb 2008 20:41:15 +0700
Subject: [PATCH] Rewrite train for easier maintenance

---
 utils/train | 56 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 16 deletions(-)
 rewrite utils/train (95%)

diff --git a/utils/train b/utils/train
dissimilarity index 95%
index 3ecfa8c..64c21a5 100755
--- a/utils/train
+++ b/utils/train
@@ -1,16 +1,40 @@
-#!/bin/bash
-O=$1
-i=$2
-[ "$i" = "0" ] || ii=${O}.arpa.$(($2-1))
-echo "sc-train --replay"
-echo "gzip -d < ${O}.scz.gz|./sc-train --wordlist ${O}.vocab --replay $ii 2>${O}.log.sc-train.$i | gzip > ${O}.sc$i.gz"
-time gzip -d < ${O}.scz.gz|./sc-train --wordlist ${O}.vocab --replay $ii 2>${O}.log.sc-train.$i | gzip > ${O}.sc$i.gz
-echo sc2wngram
-echo "gzip -d < ${O}.sc$i.gz | LANG=C sort -S64M -T $(dirname $O)| ./sc2wngram 4 |gzip > ${O}.wngram.$i.gz"
-time gzip -d < ${O}.sc$i.gz | LANG=C sort -S64M -T $(dirname $O)| ./sc2wngram 4 |gzip > ${O}.wngram.$i.gz
-echo wngram2idngram
-echo "gzip -d < ${O}.wngram.$i.gz|../../CMU-Cam_Toolkit_v2/src/wngram2idngram -n 2 -temp $(dirname $O) -vocab ${O}.vocab2 > ${O}.idngram.$i 2>${O}.log.idngram.$i"
-time gzip -d < ${O}.wngram.$i.gz|../../CMU-Cam_Toolkit_v2/src/wngram2idngram -n 2 -temp $(dirname $O) -vocab ${O}.vocab2 > ${O}.idngram.$i 2>${O}.log.idngram.$i
-echo idngram2lm
-echo "../../CMU-Cam_Toolkit_v2/src/idngram2lm -four_byte_counts -n 2 -vocab_type 0 -vocab ${O}.vocab2 -idngram ${O}.idngram.$i -arpa ${O}.arpa.$i -context ${O}.ccs -witten_bell 2>${O}.log.lm.$i"
-../../CMU-Cam_Toolkit_v2/src/idngram2lm -four_byte_counts -n 2 -vocab_type 0 -vocab ${O}.vocab2 -idngram ${O}.idngram.$i -arpa ${O}.arpa.$i -context ${O}.ccs -witten_bell 2>${O}.log.lm.$i
+#!/bin/bash
+O=$1
+i=$2
+BACKOFF=-witten_bell
+MEMSIZE=64M
+POINTCUT=4
+[ "$i" = "0" ] || ii=${O}.arpa.$(($2-1))
+time gzip -d < ${O}.scz.gz |
+	./sc-train \
+	--wordlist ${O}.vocab \
+	--replay $ii \
+	2>${O}.log.sc-train.$i |
+	LANG=C grep -v -F '<' |
+	gzip > ${O}.sc$i.gz
+
+time gzip -d < ${O}.sc$i.gz |
+	LANG=C sort \
+	-S$MEMSIZE \
+	-T $(dirname $O)|
+	./sc2wngram $POINTCUT |
+	gzip > ${O}.wngram.$i.gz
+
+time gzip -d < ${O}.wngram.$i.gz|
+	../../CMU-Cam_Toolkit_v2/src/wngram2idngram \
+	-n 2 \
+	-temp $(dirname $O) \
+	-vocab ${O}.vocab2 \
+	> ${O}.idngram.$i \
+	2>${O}.log.idngram.$i
+
+../../CMU-Cam_Toolkit_v2/src/idngram2lm \
+	-four_byte_counts \
+	-n 2 \
+	-vocab_type 0 \
+	-vocab ${O}.vocab2 \
+	-idngram ${O}.idngram.$i \
+	-arpa ${O}.arpa.$i \
+	-context ${O}.ccs \
+	$BACKOFF \
+	2>${O}.log.lm.$i
-- 
2.11.4.GIT