1 /* ====================================================================
2 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
17 * This work was supported in part by funding from the Defense Advanced
18 * Research Projects Agency and the National Science Foundation of the
19 * United States of America, and the CMU Sphinx Speech Consortium.
21 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * ====================================================================
39 * lm_3g.c -- Darpa Trigram LM module.
44 * Revision 1.14 2005/09/01 21:09:54 dhdfu
45 * Really, actually, truly consolidate byteswapping operations into
46 * byteorder.h. Where unconditional byteswapping is needed, SWAP_INT32()
47 * and SWAP_INT16() are to be used. The WORDS_BIGENDIAN macro from
48 * autoconf controls the functioning of the conditional swap macros
49 * (SWAP_?[LW]) whose names and semantics have been regularized.
50 * Private, adhoc macros have been removed.
52 * Revision 1.13 2004/12/10 16:48:56 rkm
53 * Added continuous density acoustic model handling
55 * Revision 1.12 2004/07/16 00:57:11 egouvea
56 * Added Ravi's implementation of FSG support.
58 * Revision 1.1.1.1 2004/03/01 14:30:29 rkm
61 * Revision 1.2 2004/02/27 16:15:13 rkm
64 * Revision 1.1.1.1 2003/12/03 20:05:04 rkm
65 * Initial CVS repository
67 * Revision 1.11 2001/12/11 00:24:48 lenzo
68 * Acknowledgement in License.
70 * Revision 1.10 2001/12/07 17:30:02 lenzo
71 * Clean up and remove extra lines.
73 * Revision 1.9 2001/12/07 13:11:30 lenzo
74 * Consolidate byte order code into byteorder.h. Note that there are still
75 * two "senses" of byte swapping that are confusing, and all this should
76 * be cleaned up and verified.
78 * Revision 1.8 2001/12/07 05:09:30 lenzo
81 * Revision 1.7 2001/12/07 04:27:35 lenzo
82 * License cleanup. Remove conditions on the names. Rationale: These
83 * conditions don't belong in the license itself, but in other fora that
84 * offer protection for recognizeable names such as "Carnegie Mellon
85 * University" and "Sphinx." These changes also reduce interoperability
86 * issues with other licenses such as the Mozilla Public License and the
87 * GPL. This update changes the top-level license files and removes the
88 * old license conditions from each of the files that contained it.
89 * All files in this collection fall under the copyright of the top-level
92 * Revision 1.6 2001/10/23 22:20:30 lenzo
93 * Change error logging and reporting to the E_* macros that call common
94 * functions. This will obsolete logmsg.[ch] and they will be removed
95 * or changed in future versions.
97 * Revision 1.5 2001/02/13 19:51:38 lenzo
98 * *** empty log message ***
100 * Revision 1.4 2001/02/13 18:50:35 lenzo
101 * Adding some more comments for a Solaris port.
103 * Revision 1.3 2000/12/12 23:01:42 lenzo
104 * Rationalizing libs and names some more. Split a/d and fe libs out.
106 * Revision 1.2 2000/12/05 01:45:12 lenzo
107 * Restructuring, hear rationalization, warning removal, ANSIfy
109 * Revision 1.1.1.1 2000/01/28 22:08:51 lenzo
110 * Initial import of sphinx2
113 * 28-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
114 * Added lm3g_access_type() and necessary support.
116 * 15-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
117 * Bugfix: inclass_ugscore[lw3] changed to inclass_ugscore[w3] in
118 * lm3g_tg_score(). (Thanks to dbansal@cs.)
120 * 15-Jul-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
121 * Corrected references to unigram_t.wid to unigram_t.mapid.
123 * 14-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
124 * Added lm3g_n_lm() and lm3g_index2name().
126 * 03-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University
127 * Added lm3g_raw_score() and lm_t.invlw.
128 * Changed a number of function names from lm_... to lm3g_...
130 * 09-Jan-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University
131 * BUGFIX: Added check for lmp->unigrams[i].wid in lm_set_current().
133 * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
134 * Changed function name lmname_to_lm() to lm_name2lm().
136 * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
137 * Changed function name get_current_lm to lm_get_current.
138 * Changed check for already existing word in lm_add_word, and added
139 * condition to updating dictwid_map.
141 * 01-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
142 * Removed LM cache and replaced with find_bg and find_tg within the main
143 * bigrams and trigram structures. No loss of speed and uses less memory.
145 * 24-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
146 * Fixed a number of memory leaks while deleting an LM. Added the global
147 * dictwid_map, and allocated it once and never freed. Made sure lm_cache
148 * is created only once.
150 * 14-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
151 * Modified lm_read to return 0 on success, and to delete any existing LM
152 * with the new LM name (instead of reporting error and exiting).
153 * Added backslash option in building filenames (for PC compatibility).
155 * Revision 8.9 94/10/11 12:36:28 rkm
156 * Changed lm_tg_score to call lm_bg_score if no trigrams present or
157 * the first word is invalid.
159 * Revision 8.8 94/07/29 11:54:23 rkm
160 * Renamed lmSetParameters to lm_set_param and moved it into lm_add().
161 * Added functions lm_init_oov() to create initial list of OOVs,
162 * and lm_add_word() to add new OOV at run time.
164 * Revision 8.7 94/05/19 14:19:59 rkm
165 * Rewrote LM cache code for greater efficiency.
167 * Revision 8.6 94/05/10 10:47:58 rkm
168 * Added lm_add() and lm_set_param() functions, for dynamically adding a new
169 * in-memory LM to the set of available LMs.
171 * Revision 8.5 94/04/22 13:53:27 rkm
172 * Added query_lm_cache_lines() to allow run-time spec of #cache lines.
174 * Revision 8.4 94/04/14 15:08:46 rkm
175 * Added function lm_delete() to delete a named LM and reclaim space.
177 * Revision 8.3 94/04/14 14:40:27 rkm
180 * Revision 8.1 94/02/15 15:09:22 rkm
181 * Derived from v7. Includes multiple LMs for grammar switching.
183 * Revision 6.13 94/02/11 13:14:45 rkm
184 * Added bigram and trigram multi-line caches, and functions, for v7.
185 * Replaced sequential search in lm3g_wstr2wid() with hash_lookup().
187 * Revision 6.12 94/01/07 10:56:16 rkm
188 * Corrected bug relating to input file format.
190 * Revision 6.11 93/12/17 13:14:52 rkm
191 * *** empty log message ***
193 * Revision 6.10 93/12/03 17:09:59 rkm
194 * Added ability to handle bigram-only dump files.
195 * Added <s> </s> bigram -> MIN_PROB.
196 * Added timestamp to dump files.
198 * Revision 6.9 93/12/01 12:29:55 rkm
199 * Added ability to handle LM files containing only bigrams.
200 * Excluded start_sym from interpolation of unigram prob with uniform prob.
203 * 93/10/21 rkm@cs.cmu.edu
206 * Revision 6.6 93/10/19 18:58:10 rkm
207 * Added code to change bigram-prob(<s>,<s>) to very low value. The
208 * Darpa LM file contains a spurious value to be ignored.
209 * Fixed bug that dumps one trigram entry too many.
211 * Revision 6.5 93/10/15 15:00:14 rkm
213 * Revision 6.4 93/10/13 16:56:04 rkm
214 * Added LM cache line stats.
215 * Added bg_only option for lm_read parameter list
216 * (but not yet implemented).
217 * Changed proc name ilm_LOG_prob_of to lm3g_prob, to avoid conflict
218 * with Roni's ILM function of the same name.
220 * Revision 6.3 93/10/09 17:01:55 rkm
221 * M K Ravishankar (rkm@cs) at Carnegie Mellon
222 * Cleaned up handling precompiled binary 3g LM file,
223 * Added SWAP option for HP platforms.
225 * Revision 6.2 93/10/06 11:08:15 rkm
226 * M K Ravishankar (rkm@cs) at Carnegie Mellon University
227 * Darpa Trigram LM module. Created.
233 #include <sys/types.h>
234 #include <sys/stat.h>
236 #define QUIT(x) {fflush(stdout); fprintf x; exit(-1);}
239 #include "CM_macros.h"
241 #include "strfuncs.h"
242 #include "linklist.h"
251 #define UG_MAPID(m,u) ((m)->unigrams[u].mapid)
252 #define UG_PROB_F(m,u) ((m)->unigrams[u].prob1.f)
253 #define UG_BO_WT_F(m,u) ((m)->unigrams[u].bo_wt1.f)
254 #define UG_PROB_L(m,u) ((m)->unigrams[u].prob1.l)
255 #define UG_BO_WT_L(m,u) ((m)->unigrams[u].bo_wt1.l)
256 #define FIRST_BG(m,u) ((m)->unigrams[u].bigrams)
257 #define LAST_BG(m,u) (FIRST_BG((m),(u)+1)-1)
259 #define BG_WID(m,b) ((m)->bigrams[b].wid)
260 #define BG_PROB_F(m,b) ((m)->prob2[(m)->bigrams[b].prob2].f)
261 #define BG_BO_WT_F(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].f)
262 #define BG_PROB_L(m,b) ((m)->prob2[(m)->bigrams[b].prob2].l)
263 #define BG_BO_WT_L(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].l)
264 #define TSEG_BASE(m,b) ((m)->tseg_base[(b)>>LOG_BG_SEG_SZ])
265 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->bigrams[b].trigrams))
266 #define LAST_TG(m,b) (FIRST_TG((m),(b)+1)-1)
268 #define TG_WID(m,t) ((m)->trigrams[t].wid)
269 #define TG_PROB_F(m,t) ((m)->prob3[(m)->trigrams[t].prob3].f)
270 #define TG_PROB_L(m,t) ((m)->prob3[(m)->trigrams[t].prob3].l)
272 /* FIXME: put this in a header file */
273 extern void quit (int status
, char const *fmt
, ...);
275 static char const *start_sym
= "<s>";
276 static char const *end_sym
= "</s>";
280 #define MIN_PROB_F -99.0
282 #define MAX_SORTED_ENTRIES 65534
284 /* Base values for ranges of unigram_t.mapid */
285 #define LM_DICTWID_BASE 0 /* Do not change this */
286 #define LM_CLASSID_BASE 0x01000000 /* Interpreted as LMclass ID */
287 #define LM_DICTWID_BADMAP -16000 /* An illegal mapping */
292 * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed
293 * to replace spurious values in the Darpa LM file.
295 static void init_sorted_list (sorted_list_t
*l
)
298 (sorted_entry_t
*) CM_calloc (MAX_SORTED_ENTRIES
, sizeof (sorted_entry_t
));
299 l
->list
[0].val
.f
= MIN_PROB_F
;
300 l
->list
[0].lower
= 0;
301 l
->list
[0].higher
= 0;
305 static void free_sorted_list (sorted_list_t
*l
)
310 static log_t
*vals_in_sorted_list (sorted_list_t
*l
)
315 vals
= (log_t
*) CM_calloc (l
->free
, sizeof (log_t
));
316 for (i
= 0; i
< l
->free
; i
++)
317 vals
[i
].f
= l
->list
[i
].val
.f
;
321 static int32
sorted_id (sorted_list_t
*l
, float *val
)
326 if (*val
== l
->list
[i
].val
.f
)
328 if (*val
< l
->list
[i
].val
.f
) {
329 if (l
->list
[i
].lower
== 0) {
330 if (l
->free
>= MAX_SORTED_ENTRIES
)
331 E_FATAL("sorted list overflow\n");
333 l
->list
[i
].lower
= l
->free
;
335 i
= l
->list
[i
].lower
;
336 l
->list
[i
].val
.f
= *val
;
339 i
= l
->list
[i
].lower
;
341 if (l
->list
[i
].higher
== 0) {
342 if (l
->free
>= MAX_SORTED_ENTRIES
)
343 E_FATAL("sorted list overflow\n");
345 l
->list
[i
].higher
= l
->free
;
347 i
= l
->list
[i
].higher
;
348 l
->list
[i
].val
.f
= *val
;
351 i
= l
->list
[i
].higher
;
357 * allocate, initialize and return pointer to an array of unigram entries.
359 static unigram_t
*NewUnigramTable (int32 n_ug
)
364 table
= (unigram_t
*) CM_calloc (n_ug
, sizeof (unigram_t
));
365 for (i
= 0; i
< n_ug
; i
++) {
366 table
[i
].mapid
= NO_WORD
;
367 table
[i
].prob1
.f
= -99.0;
368 table
[i
].bo_wt1
.f
= -99.0;
374 * returns a pointer to a new language model record. The size is passed in
378 NewModel (n_ug
, n_bg
, n_tg
)
385 model
= (lm_t
*) CM_calloc (1, sizeof (lm_t
));
388 * Allocate one extra unigram and bigram entry: sentinels to terminate
389 * followers (bigrams and trigrams, respectively) of previous entry.
391 model
->unigrams
= NewUnigramTable (n_ug
+1);
392 model
->bigrams
= (bigram_t
*) CM_calloc (n_bg
+1, sizeof (bigram_t
));
394 model
->trigrams
= (trigram_t
*) CM_calloc (n_tg
, sizeof (trigram_t
));
397 model
->tseg_base
= (int32
*) CM_calloc ((n_bg
+1)/BG_SEG_SZ
+1, sizeof (int32
));
399 E_INFO("%8d = tseg_base entries allocated\n",
400 (n_bg
+1)/BG_SEG_SZ
+1);
404 model
->max_ucount
= model
->ucount
= n_ug
;
405 model
->bcount
= n_bg
;
406 model
->tcount
= n_tg
;
410 model
->HT
.tab
= NULL
;
415 int32
lm3g_wstr2wid (lm_t
*model
, const char *w
)
419 if (hash_lookup (&(model
->HT
), w
, &val
) != 0)
421 return ((int32
) val
);
425 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
427 static void ReadNgramCounts (FILE *fp
,
428 int32
*n_ug
, int32
*n_bg
, int32
*n_tg
)
431 int32 ngram
, ngram_cnt
;
433 /* skip file until past the '\data\' marker */
435 fgets (string
, sizeof (string
), fp
);
436 while ( (strcmp (string
, "\\data\\\n") != 0) && (! feof (fp
)) );
438 if (strcmp (string
, "\\data\\\n") != 0)
439 E_FATAL("No \\data\\ mark in LM file\n");
441 *n_ug
= *n_bg
= *n_tg
= 0;
442 while (fgets (string
, sizeof (string
), fp
) != NULL
) {
443 if (sscanf (string
, "ngram %d=%d", &ngram
, &ngram_cnt
) != 2)
446 case 1: *n_ug
= ngram_cnt
;
448 case 2: *n_bg
= ngram_cnt
;
450 case 3: *n_tg
= ngram_cnt
;
453 E_FATAL("Unknown ngram (%d)\n", ngram
);
458 /* Position file to just after the unigrams header '\1-grams:\' */
459 while ( (strcmp (string
, "\\1-grams:\n") != 0) && (! feof (fp
)) )
460 fgets (string
, sizeof (string
), fp
);
462 /* Check counts; NOTE: #trigrams *CAN* be 0 */
463 if ((*n_ug
<= 0) || (*n_bg
<= 0) || (*n_tg
< 0))
464 E_FATAL("Bad or missing ngram count\n");
468 * Read in the unigrams from given file into the LM structure model. On
469 * entry to this procedure, the file pointer is positioned just after the
470 * header line '\1-grams:'.
472 static void ReadUnigrams (FILE *fp
, lm_t
*model
)
479 E_INFO ("Reading unigrams\n");
482 while ((fgets (string
, sizeof(string
), fp
) != NULL
) &&
483 (strcmp (string
, "\\2-grams:\n") != 0))
485 if (sscanf (string
, "%f %s %f", &p1
, name
, &bo_wt
) != 3) {
486 if (string
[0] != '\n')
487 E_WARN ("Format error; unigram ignored:%s", string
);
491 if (wcnt
>= model
->ucount
)
492 E_FATAL("Too many unigrams\n");
494 /* Associate name with word id */
495 model
->word_str
[wcnt
] = (char *) salloc (name
);
496 hash_add (&(model
->HT
), model
->word_str
[wcnt
], (caddr_t
) (wcnt
+1));
497 model
->unigrams
[wcnt
].prob1
.f
= p1
;
498 model
->unigrams
[wcnt
].bo_wt1
.f
= bo_wt
;
500 model
->unigrams
[wcnt
].mapid
= wcnt
;
504 if (model
->ucount
!= wcnt
) {
505 E_WARN ("lm_t.ucount(%d) != #unigrams read(%d)\n",
506 model
->ucount
, wcnt
);
507 model
->ucount
= wcnt
;
512 * Read bigrams from given file into given model structure. File may be arpabo
513 * or arpabo-id format, depending on idfmt = 0 or 1.
515 static void ReadBigrams (FILE *fp
, lm_t
*model
, int32 idfmt
)
517 char string
[1024], word1
[256], word2
[256];
518 int32 w1
, w2
, prev_w1
, bgcount
, p
;
523 E_INFO ("Reading bigrams\n");
526 bgptr
= model
->bigrams
;
528 n_fld
= (model
->tcount
> 0) ? 4 : 3;
531 while (fgets (string
, sizeof(string
), fp
) != NULL
) {
533 n
= sscanf (string
, "%f %s %s %f", &p2
, word1
, word2
, &bo_wt
);
535 n
= sscanf (string
, "%f %d %d %f", &p2
, &w1
, &w2
, &bo_wt
);
537 if (string
[0] != '\n')
543 if ((w1
= lm3g_wstr2wid (model
, word1
)) == NO_WORD
)
544 E_FATAL("Unknown word: %s\n", word1
);
545 w1
--; // hash value is 1-base
546 if ((w2
= lm3g_wstr2wid (model
, word2
)) == NO_WORD
)
547 E_FATAL("Unknown word: %s\n", word2
);
548 w2
--; // hash value is 1-base
550 if ((w1
>= model
->ucount
) || (w2
>= model
->ucount
) || (w1
< 0) || (w2
< 0))
551 E_FATAL("Bad bigram: %s", string
);
554 /* HACK!! to quantize probs to 4 decimal digits */
560 if (bgcount
>= model
->bcount
)
561 E_FATAL("Too many bigrams\n");
564 bgptr
->prob2
= sorted_id (&model
->sorted_prob2
, &p2
);
565 if (model
->tcount
> 0)
566 bgptr
->bo_wt2
= sorted_id (&model
->sorted_bo_wt2
, &bo_wt
);
570 E_FATAL("Bigrams not in unigram order\n");
572 for (prev_w1
++; prev_w1
<= w1
; prev_w1
++)
573 model
->unigrams
[prev_w1
].bigrams
= bgcount
;
580 if ((bgcount
& 0x0000ffff) == 0) {
584 if ((strcmp (string
, "\\end\\\n") != 0) && (strcmp (string
, "\\3-grams:\n") != 0))
585 E_FATAL("Bad bigram: %s\n", string
);
587 for (prev_w1
++; prev_w1
<= model
->ucount
; prev_w1
++)
588 model
->unigrams
[prev_w1
].bigrams
= bgcount
;
592 * Very similar to ReadBigrams.
594 static void ReadTrigrams (FILE *fp
, lm_t
*model
, int32 idfmt
)
596 char string
[1024], word1
[256], word2
[256], word3
[256];
597 int32 i
, n
, w1
, w2
, w3
, prev_w1
, prev_w2
, tgcount
, prev_bg
, bg
, endbg
, p
;
598 int32 seg
, prev_seg
, prev_seg_lastbg
;
603 E_INFO ("Reading trigrams\n");
606 tgptr
= model
->trigrams
;
612 while (fgets (string
, sizeof(string
), fp
) != NULL
) {
614 n
= sscanf (string
, "%f %s %s %s", &p3
, word1
, word2
, word3
);
616 n
= sscanf (string
, "%f %d %d %d", &p3
, &w1
, &w2
, &w3
);
618 if (string
[0] != '\n')
624 if ((w1
= lm3g_wstr2wid (model
, word1
)) == NO_WORD
)
625 E_FATAL("Unknown word: %s\n", word1
);
626 w1
--; // hash value is 1-base
627 if ((w2
= lm3g_wstr2wid (model
, word2
)) == NO_WORD
)
628 E_FATAL("Unknown word: %s\n", word2
);
629 w2
--; // hash value is 1-base
630 if ((w3
= lm3g_wstr2wid (model
, word3
)) == NO_WORD
)
631 E_FATAL("Unknown word: %s\n", word3
);
632 w3
--; // hash value is 1-base
634 if ((w1
>= model
->ucount
) || (w2
>= model
->ucount
) || (w3
>= model
->ucount
) ||
635 (w1
< 0) || (w2
< 0) || (w3
< 0))
636 E_FATAL("Bad trigram: %s", string
);
639 /* HACK!! to quantize probs to 4 decimal digits */
643 if (tgcount
>= model
->tcount
)
644 E_FATAL("Too many trigrams\n");
647 tgptr
->prob3
= sorted_id (&model
->sorted_prob3
, &p3
);
649 if ((w1
!= prev_w1
) || (w2
!= prev_w2
)) {
650 /* Trigram for a new bigram; update tg info for all previous bigrams */
651 if ((w1
< prev_w1
) || ((w1
== prev_w1
) && (w2
< prev_w2
)))
652 E_FATAL("Trigrams not in bigram order\n");
654 bg
= (w1
!= prev_w1
) ? model
->unigrams
[w1
].bigrams
: prev_bg
+1;
655 endbg
= model
->unigrams
[w1
+1].bigrams
;
656 bgptr
= model
->bigrams
+ bg
;
657 for (; (bg
< endbg
) && (bgptr
->wid
!= w2
); bg
++, bgptr
++);
659 E_FATAL("Missing bigram for trigram: %s", string
);
661 /* bg = bigram entry index for <w1,w2>. Update tseg_base */
662 seg
= bg
>> LOG_BG_SEG_SZ
;
663 for (i
= prev_seg
+1; i
<= seg
; i
++)
664 model
->tseg_base
[i
] = tgcount
;
666 /* Update trigrams pointers for all bigrams until bg */
667 if (prev_seg
< seg
) {
671 tgoff
= tgcount
- model
->tseg_base
[prev_seg
];
673 E_FATAL("Offset from tseg_base > 65535\n");
676 prev_seg_lastbg
= ((prev_seg
+1) << LOG_BG_SEG_SZ
) - 1;
677 bgptr
= model
->bigrams
+ prev_bg
;
678 for (++prev_bg
, ++bgptr
; prev_bg
<= prev_seg_lastbg
; prev_bg
++, bgptr
++)
679 bgptr
->trigrams
= tgoff
;
681 for (; prev_bg
<= bg
; prev_bg
++, bgptr
++)
686 tgoff
= tgcount
- model
->tseg_base
[prev_seg
];
688 E_FATAL("Offset from tseg_base > 65535\n");
690 bgptr
= model
->bigrams
+ prev_bg
;
691 for (++prev_bg
, ++bgptr
; prev_bg
<= bg
; prev_bg
++, bgptr
++)
692 bgptr
->trigrams
= tgoff
;
704 if ((tgcount
& 0x0000ffff) == 0) {
708 if (strcmp (string
, "\\end\\\n") != 0)
709 E_FATAL("Bad trigram: %s\n", string
);
711 for (prev_bg
++; prev_bg
<= model
->bcount
; prev_bg
++) {
712 if ((prev_bg
& (BG_SEG_SZ
-1)) == 0)
713 model
->tseg_base
[prev_bg
>> LOG_BG_SEG_SZ
] = tgcount
;
714 if ((tgcount
- model
->tseg_base
[prev_bg
>> LOG_BG_SEG_SZ
]) > 65535)
715 E_FATAL("Offset from tseg_base > 65535\n");
716 model
->bigrams
[prev_bg
].trigrams
=
717 tgcount
- model
->tseg_base
[prev_bg
>> LOG_BG_SEG_SZ
];
721 static FILE *lm_file_open (char const *filename
, int32 usepipe
)
728 sprintf (command
, "D:\\compress\\gzip.exe -d -c %s", filename
);
729 if ((fp
= _popen (command
, "r")) == NULL
)
730 E_FATAL("Cannot popen %s\n", command
);
732 sprintf (command
, "zcat %s", filename
);
733 if ((fp
= popen (command
, "r")) == NULL
)
734 E_FATAL("Cannot popen %s\n", command
);
738 fp
= CM_fopen (filename
, "r");
744 * Read in a trigram language model from the given file. The LM tokens can be word
745 * classes. However a given actual word can belong to AT MOST ONE of the LM classes
748 lm_t
* lm3g_read (char const *filename
)
752 int32 usingPipe
= FALSE
;
757 int32 i
, j
, k
, last_bg
, last_tg
;
761 E_INFO ("Reading LM file %s\n", filename
);
763 /* Check if a compressed file */
764 k
= strlen(filename
);
766 usingPipe
= (k
> 3) &&
767 ((strcmp (filename
+k
-3, ".gz") == 0) || (strcmp (filename
+k
-3, ".GZ") == 0));
769 usingPipe
= (k
> 2) &&
770 ((strcmp (filename
+k
-2, ".Z") == 0) || (strcmp (filename
+k
-2, ".z") == 0));
772 /* Check if an .arpabo-id format file; More HACK!! Hardwired check for -id */
775 idfmt
= ((k
> 3) && (strncmp (filename
+k
-3, "-id", 3) == 0));
777 fp
= lm_file_open (filename
, usingPipe
);
778 if (stat(filename
, &statbuf
) < 0)
779 E_FATAL("stat(%s) failed\n", filename
);
781 /* Read #unigrams, #bigrams, #trigrams from file */
782 ReadNgramCounts (fp
, &n_unigram
, &n_bigram
, &n_trigram
);
783 E_INFO ("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram
, n_bigram
, n_trigram
);
785 /* Determine dictionary size (for dict-wid -> LM-wid map) */
786 dict_size
= n_unigram
;
788 if (dict_size
>= 65535)
789 E_FATAL("#dict-words(%d) > 65534\n", dict_size
);
791 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
792 model
= NewModel (n_unigram
, n_bigram
, n_trigram
);
793 model
->word_str
= (char **) CM_calloc (n_unigram
, sizeof (char *));
794 model
->n_word_str
= n_unigram
;
796 /* Load the precompiled binary dump form of the Darpa LM file if it exists */
798 ReadUnigrams (fp
, model
);
799 E_INFO("%8d = #unigrams created\n", model
->ucount
);
801 init_sorted_list (&model
->sorted_prob2
);
802 if (model
->tcount
> 0)
803 init_sorted_list (&model
->sorted_bo_wt2
);
805 ReadBigrams (fp
, model
, idfmt
);
807 model
->bcount
= FIRST_BG(model
,model
->ucount
);
808 model
->n_prob2
= model
->sorted_prob2
.free
;
809 model
->prob2
= vals_in_sorted_list (&model
->sorted_prob2
);
810 free_sorted_list (&model
->sorted_prob2
);
811 E_INFO("\n%8d = #bigrams created\n", model
->bcount
);
812 E_INFO("%8d = #prob2 entries\n", model
->n_prob2
);
814 if (model
->tcount
> 0) {
815 /* Create trigram bo-wts array */
816 model
->n_bo_wt2
= model
->sorted_bo_wt2
.free
;
817 model
->bo_wt2
= vals_in_sorted_list (&model
->sorted_bo_wt2
);
818 free_sorted_list (&model
->sorted_bo_wt2
);
819 E_INFO("%8d = #bo_wt2 entries\n", model
->n_bo_wt2
);
821 init_sorted_list (&model
->sorted_prob3
);
823 ReadTrigrams (fp
, model
, idfmt
);
825 model
->tcount
= FIRST_TG(model
,model
->bcount
);
826 model
->n_prob3
= model
->sorted_prob3
.free
;
827 model
->prob3
= vals_in_sorted_list (&model
->sorted_prob3
);
828 E_INFO("\n%8d = #trigrams created\n", model
->tcount
);
829 E_INFO("%8d = #prob3 entries\n", model
->n_prob3
);
831 free_sorted_list (&model
->sorted_prob3
);
838 * Discourage expansion of end_sym and transition to start_sym. (The given
839 * Darpa LM may contain some spurious values that don't reflect these
842 /* bo_wt(</s>) = MIN_PROB_F */
843 for (i
= 0; (i
< model
->ucount
) && (strcmp (model
->word_str
[i
], end_sym
) != 0); i
++);
844 E_INFO("bo_wt(%s) changed from %.4f to %.4f\n",
845 model
->word_str
[i
], model
->unigrams
[i
].bo_wt1
.f
, MIN_PROB_F
);
846 model
->unigrams
[i
].bo_wt1
.f
= MIN_PROB_F
;
848 /* unigram prob(<s>) = MIN_PROB_F */
849 for (i
= 0; (i
< model
->ucount
) && (strcmp (model
->word_str
[i
], start_sym
) != 0); i
++);
850 E_INFO("prob(%s) changed from %.4f to %.4f\n",
851 model
->word_str
[i
], model
->unigrams
[i
].prob1
.f
, MIN_PROB_F
);
852 model
->unigrams
[i
].prob1
.f
= MIN_PROB_F
;
854 /* bigram prob(<s>,<s>) = MIN_PROB_F (if bigram exists) */
855 j
= FIRST_BG(model
,i
);
856 last_bg
= LAST_BG(model
,i
);
857 for (; (j
<=last_bg
) && (strcmp(model
->word_str
[BG_WID(model
,j
)],start_sym
)!=0); j
++);
859 E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
860 model
->word_str
[i
], model
->word_str
[BG_WID(model
,j
)],
861 model
->prob2
[model
->bigrams
[j
].prob2
].f
,
863 model
->bigrams
[j
].prob2
= 0;
865 if (model
->tcount
> 0) {
866 /* trigram prob(<s>,<s>,<s>) = MIN_PROB_F (if trigram exists) */
867 k
= FIRST_TG(model
,j
);
868 last_tg
= LAST_TG(model
,j
);
869 for (; k
<= last_tg
; k
++) {
870 if (strcmp (model
->word_str
[TG_WID(model
,k
)], start_sym
) == 0)
874 E_INFO("prob(%s,%s,%s) changed from %.4f to %.4f\n",
875 model
->word_str
[i
], model
->word_str
[BG_WID(model
,j
)], model
->word_str
[TG_WID(model
,k
)],
876 model
->prob3
[model
->trigrams
[k
].prob3
].f
,
878 model
->trigrams
[k
].prob3
= 0;
883 /* bigram prob(<s>,</s>) = MIN_PROB_F (if bigram exists) */
884 j
= FIRST_BG(model
,i
);
885 last_bg
= LAST_BG(model
,i
);
886 for (; (j
<=last_bg
) && (strcmp(model
->word_str
[BG_WID(model
,j
)],end_sym
)!=0); j
++);
888 E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
889 model
->word_str
[i
], model
->word_str
[BG_WID(model
,j
)],
890 model
->prob2
[model
->bigrams
[j
].prob2
].f
,
892 model
->bigrams
[j
].prob2
= 0;
898 void lm3g_free (lm_t
*model
)
901 tginfo_t
*tginfo
, *next_tginfo
;
903 hash_free (&(model
->HT
));
904 for (i
= 0; i
< model
->ucount
; i
++)
905 free (model
->word_str
[i
]);
906 free (model
->word_str
);
908 free (model
->unigrams
);
909 free (model
->bigrams
);
911 if (model
->tcount
> 0) {
912 free (model
->trigrams
);
913 free (model
->tseg_base
);
914 free (model
->bo_wt2
);
917 if (model
->HT
.tab
!= NULL
)
918 hash_free (&model
->HT
);
921 for (u
= 0; u
< model
->max_ucount
; u
++)
922 for (tginfo
= model
->tginfo
[u
]; tginfo
; tginfo
= next_tginfo
) {
923 next_tginfo
= tginfo
->next
;
924 listelem_free ((void *)tginfo
, sizeof(tginfo_t
));
926 free (model
->tginfo
);
932 void lmSetStartSym (char const *sym
)
933 /*----------------------------*
934 * Description - reconfigure the start symbol
937 start_sym
= (char *) salloc(sym
);
940 void lmSetEndSym (char const *sym
)
941 /*----------------------------*
942 * Description - reconfigure the end symbol
945 end_sym
= (char *) salloc(sym
);
956 char wd
[3][100], line
[1000];
961 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv
[0]);
962 if (sscanf(argv
[1], "%f", &lw
) != 1)
963 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv
[0]);
964 if (sscanf(argv
[2], "%f", &uw
) != 1)
965 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv
[0]);
966 if (sscanf(argv
[3], "%f", &wip
) != 1)
967 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv
[0]);
969 lm_read(argv
[4], lw
, uw
, wip
, 1.0);
971 lm_read("/net/alf8/cdrom/wsj1/grammar/tgboc20o.nvp", lw
, uw
, wip
, 1.0);
973 lmSetParameters (model
, 0);
975 unkwid
= lm3g_wstr2wid (model
, "<UNK>");
976 unkwid
--; // hash value is 1-base
978 printf ("Enter 1, 2, or 3 words: ");
979 fgets (line
, sizeof(line
), stdin
);
980 if (((n
= sscanf (line
, "%s %s %s", wd
[0], wd
[1], wd
[2])) < 1) || (n
> 3))
982 for (i
= 0; i
< n
; i
++)
983 if ((wid
[i
] = lm3g_wstr2wid (model
, wd
[i
])) == NO_WORD
) {
984 printf (" %s -> <UNK>\n", wd
[i
]);
992 score
= unigram_score (wid
[0]);
993 printf ("unigram_score(%d) = %d\n",
997 score
= bigram_score (wid
[0], wid
[1]);
998 printf ("bigram_score(%d, %d) = %d\n",
999 wid
[0], wid
[1], score
);
1002 score
= trigram_score (wid
[0], wid
[1], wid
[2]);
1003 printf ("trigram_score(%d, %d, %d) = %d\n",
1004 wid
[0], wid
[1], wid
[2], score
);
1013 #define BINARY_SEARCH_THRESH 16
1015 float lm3g_ug_score (lm_t
*lm
, int32 wid
)
1017 if (wid
>= lm
->ucount
)
1018 E_FATAL("dictwid[%d] not in LM\n", wid
);
1020 return (lm
->unigrams
[wid
].prob1
.f
);
1023 /* Locate a specific bigram within a bigram list */
1024 static int32
find_bg (bigram_t
*bg
, int32 n
, int32 w
)
1028 /* Binary search until segment size < threshold */
1031 while (e
-b
> BINARY_SEARCH_THRESH
) {
1035 else if (bg
[i
].wid
> w
)
1041 /* Linear search within narrowed segment */
1042 for (i
= b
; (i
< e
) && (bg
[i
].wid
!= w
); i
++);
1043 return ((i
< e
) ? i
: -1);
1046 /* w1, w2 are dictionary (base-)word ids */
1047 float lm3g_bg_score (lm_t
*lm
, int32 w1
, int32 w2
)
1053 /* lm->n_bg_score++; */
1055 if (w1
>= lm
->ucount
)
1056 E_FATAL("dictwid[%d] not in LM\n", w1
);
1057 if (w2
>= lm
->ucount
)
1058 E_FATAL("dictwid[%d] not in LM\n", w2
);
1060 b
= FIRST_BG(lm
, w1
);
1061 n
= FIRST_BG(lm
, w1
+1) - b
;
1062 bg
= lm
->bigrams
+ b
;
1064 if ((i
= find_bg (bg
, n
, w2
)) >= 0) {
1065 score
= lm
->prob2
[bg
[i
].prob2
].f
;
1067 /* lm->n_bg_bo++; */
1068 score
= lm
->unigrams
[w1
].bo_wt1
.f
+ lm
->unigrams
[w2
].prob1
.f
;
1074 static void load_tginfo (lm_t
*lm
, int32 lw1
, int32 lw2
)
1080 /* First allocate space for tg information for bg lw1,lw2 */
1081 tginfo
= (tginfo_t
*) listelem_alloc (sizeof(tginfo_t
));
1084 tginfo
->next
= lm
->tginfo
[lw2
];
1085 lm
->tginfo
[lw2
] = tginfo
;
1087 /* Locate bigram lw1,lw2 */
1089 b
= lm
->unigrams
[lw1
].bigrams
;
1090 n
= lm
->unigrams
[lw1
+1].bigrams
- b
;
1091 bg
= lm
->bigrams
+ b
;
1093 if ((n
> 0) && ((i
= find_bg (bg
, n
, lw2
)) >= 0)) {
1094 tginfo
->bowt
= lm
->bo_wt2
[bg
[i
].bo_wt2
].f
;
1096 /* Find t = Absolute first trigram index for bigram lw1,lw2 */
1097 b
+= i
; /* b = Absolute index of bigram lw1,lw2 on disk */
1098 t
= FIRST_TG(lm
, b
);
1100 tginfo
->tg
= lm
->trigrams
+ t
;
1102 /* Find #tg for bigram w1,w2 */
1103 tginfo
->n_tg
= FIRST_TG(lm
, b
+1) - t
;
1104 } else { /* No bigram w1,w2 */
1110 /* Similar to find_bg */
1111 static int32
find_tg (trigram_t
*tg
, int32 n
, int32 w
)
1117 while (e
-b
> BINARY_SEARCH_THRESH
) {
1121 else if (tg
[i
].wid
> w
)
1127 for (i
= b
; (i
< e
) && (tg
[i
].wid
!= w
); i
++);
1128 return ((i
< e
) ? i
: -1);
1131 /* w1, w2, w3 are dictionary wids */
1132 float lm3g_tg_score (lm_t
*lm
, int32 w1
, int32 w2
, int32 w3
)
1137 tginfo_t
*tginfo
, *prev_tginfo
;
1139 if ((lm
->tcount
<= 0) || (w1
< 0))
1140 return (lm3g_bg_score (lm
, w2
, w3
));
1142 /* lm->n_tg_score++; */
1144 if (w1
>= lm
->ucount
)
1145 E_FATAL("dictwid[%d] not in LM\n", w1
);
1146 if (w2
>= lm
->ucount
)
1147 E_FATAL("dictwid[%d] not in LM\n", w2
);
1148 if (w3
>= lm
->ucount
)
1149 E_FATAL("dictwid[%d] not in LM\n", w3
);
1152 for (tginfo
= lm
->tginfo
[w2
]; tginfo
; tginfo
= tginfo
->next
) {
1153 if (tginfo
->w1
== w1
)
1155 prev_tginfo
= tginfo
;
1159 load_tginfo (lm
, w1
, w2
);
1160 tginfo
= lm
->tginfo
[w2
];
1161 } else if (prev_tginfo
) {
1162 prev_tginfo
->next
= tginfo
->next
;
1163 tginfo
->next
= lm
->tginfo
[w2
];
1164 lm
->tginfo
[w2
] = tginfo
;
1169 /* Trigrams for w1,w2 now pointed to by tginfo */
1172 if ((i
= find_tg (tg
, n
, w3
)) >= 0) {
1173 score
= lm
->prob3
[tg
[i
].prob3
].f
;
1175 /* lm->n_tg_bo++; */
1176 score
= tginfo
->bowt
+ lm3g_bg_score(lm
, w2
, w3
);
1182 int lm3g_word_str_size(lm_t
*lm
)
1184 return lm
->n_word_str
;