softcount: tolerate zero ngrams
[vspell.git] / libvspell / lm / lm_3g.c
blob830281c7a13ad5e56b83097e57c657f46e28d3e6
1 /* ====================================================================
2 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
3 * reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
17 * This work was supported in part by funding from the Defense Advanced
18 * Research Projects Agency and the National Science Foundation of the
19 * United States of America, and the CMU Sphinx Speech Consortium.
21 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
22 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
23 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
25 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 * ====================================================================
37 /*
39 * lm_3g.c -- Darpa Trigram LM module.
41 * HISTORY
43 * $Log: lm_3g.c,v $
44 * Revision 1.14 2005/09/01 21:09:54 dhdfu
45 * Really, actually, truly consolidate byteswapping operations into
46 * byteorder.h. Where unconditional byteswapping is needed, SWAP_INT32()
47 * and SWAP_INT16() are to be used. The WORDS_BIGENDIAN macro from
48 * autoconf controls the functioning of the conditional swap macros
49 * (SWAP_?[LW]) whose names and semantics have been regularized.
50 * Private, adhoc macros have been removed.
52 * Revision 1.13 2004/12/10 16:48:56 rkm
53 * Added continuous density acoustic model handling
55 * Revision 1.12 2004/07/16 00:57:11 egouvea
56 * Added Ravi's implementation of FSG support.
58 * Revision 1.1.1.1 2004/03/01 14:30:29 rkm
61 * Revision 1.2 2004/02/27 16:15:13 rkm
62 * Added FSG switching
64 * Revision 1.1.1.1 2003/12/03 20:05:04 rkm
65 * Initial CVS repository
67 * Revision 1.11 2001/12/11 00:24:48 lenzo
68 * Acknowledgement in License.
70 * Revision 1.10 2001/12/07 17:30:02 lenzo
71 * Clean up and remove extra lines.
73 * Revision 1.9 2001/12/07 13:11:30 lenzo
74 * Consolidate byte order code into byteorder.h. Note that there are still
75 * two "senses" of byte swapping that are confusing, and all this should
76 * be cleaned up and verified.
78 * Revision 1.8 2001/12/07 05:09:30 lenzo
79 * License.xsxc
81 * Revision 1.7 2001/12/07 04:27:35 lenzo
82 * License cleanup. Remove conditions on the names. Rationale: These
83 * conditions don't belong in the license itself, but in other fora that
84 * offer protection for recognizeable names such as "Carnegie Mellon
85 * University" and "Sphinx." These changes also reduce interoperability
86 * issues with other licenses such as the Mozilla Public License and the
87 * GPL. This update changes the top-level license files and removes the
88 * old license conditions from each of the files that contained it.
89 * All files in this collection fall under the copyright of the top-level
90 * LICENSE file.
92 * Revision 1.6 2001/10/23 22:20:30 lenzo
93 * Change error logging and reporting to the E_* macros that call common
94 * functions. This will obsolete logmsg.[ch] and they will be removed
95 * or changed in future versions.
97 * Revision 1.5 2001/02/13 19:51:38 lenzo
98 * *** empty log message ***
100 * Revision 1.4 2001/02/13 18:50:35 lenzo
101 * Adding some more comments for a Solaris port.
103 * Revision 1.3 2000/12/12 23:01:42 lenzo
104 * Rationalizing libs and names some more. Split a/d and fe libs out.
106 * Revision 1.2 2000/12/05 01:45:12 lenzo
107 * Restructuring, hear rationalization, warning removal, ANSIfy
109 * Revision 1.1.1.1 2000/01/28 22:08:51 lenzo
110 * Initial import of sphinx2
113 * 28-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
114 * Added lm3g_access_type() and necessary support.
116 * 15-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
117 * Bugfix: inclass_ugscore[lw3] changed to inclass_ugscore[w3] in
118 * lm3g_tg_score(). (Thanks to dbansal@cs.)
120 * 15-Jul-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
121 * Corrected references to unigram_t.wid to unigram_t.mapid.
123 * 14-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
124 * Added lm3g_n_lm() and lm3g_index2name().
126 * 03-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University
127 * Added lm3g_raw_score() and lm_t.invlw.
128 * Changed a number of function names from lm_... to lm3g_...
130 * 09-Jan-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University
131 * BUGFIX: Added check for lmp->unigrams[i].wid in lm_set_current().
133 * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
134 * Changed function name lmname_to_lm() to lm_name2lm().
136 * 06-Dec-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
137 * Changed function name get_current_lm to lm_get_current.
138 * Changed check for already existing word in lm_add_word, and added
139 * condition to updating dictwid_map.
141 * 01-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
142 * Removed LM cache and replaced with find_bg and find_tg within the main
143 * bigrams and trigram structures. No loss of speed and uses less memory.
145 * 24-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
146 * Fixed a number of memory leaks while deleting an LM. Added the global
147 * dictwid_map, and allocated it once and never freed. Made sure lm_cache
148 * is created only once.
150 * 14-Jun-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
151 * Modified lm_read to return 0 on success, and to delete any existing LM
152 * with the new LM name (instead of reporting error and exiting).
153 * Added backslash option in building filenames (for PC compatibility).
155 * Revision 8.9 94/10/11 12:36:28 rkm
156 * Changed lm_tg_score to call lm_bg_score if no trigrams present or
157 * the first word is invalid.
159 * Revision 8.8 94/07/29 11:54:23 rkm
160 * Renamed lmSetParameters to lm_set_param and moved it into lm_add().
161 * Added functions lm_init_oov() to create initial list of OOVs,
162 * and lm_add_word() to add new OOV at run time.
164 * Revision 8.7 94/05/19 14:19:59 rkm
165 * Rewrote LM cache code for greater efficiency.
167 * Revision 8.6 94/05/10 10:47:58 rkm
168 * Added lm_add() and lm_set_param() functions, for dynamically adding a new
169 * in-memory LM to the set of available LMs.
171 * Revision 8.5 94/04/22 13:53:27 rkm
172 * Added query_lm_cache_lines() to allow run-time spec of #cache lines.
174 * Revision 8.4 94/04/14 15:08:46 rkm
175 * Added function lm_delete() to delete a named LM and reclaim space.
177 * Revision 8.3 94/04/14 14:40:27 rkm
178 * Minor changes.
180 * Revision 8.1 94/02/15 15:09:22 rkm
181 * Derived from v7. Includes multiple LMs for grammar switching.
183 * Revision 6.13 94/02/11 13:14:45 rkm
184 * Added bigram and trigram multi-line caches, and functions, for v7.
185 * Replaced sequential search in lm3g_wstr2wid() with hash_lookup().
187 * Revision 6.12 94/01/07 10:56:16 rkm
188 * Corrected bug relating to input file format.
190 * Revision 6.11 93/12/17 13:14:52 rkm
191 * *** empty log message ***
193 * Revision 6.10 93/12/03 17:09:59 rkm
194 * Added ability to handle bigram-only dump files.
195 * Added <s> </s> bigram -> MIN_PROB.
196 * Added timestamp to dump files.
198 * Revision 6.9 93/12/01 12:29:55 rkm
199 * Added ability to handle LM files containing only bigrams.
200 * Excluded start_sym from interpolation of unigram prob with uniform prob.
203 * 93/10/21 rkm@cs.cmu.edu
204 * Added <c.h>
206 * Revision 6.6 93/10/19 18:58:10 rkm
207 * Added code to change bigram-prob(<s>,<s>) to very low value. The
208 * Darpa LM file contains a spurious value to be ignored.
209 * Fixed bug that dumps one trigram entry too many.
211 * Revision 6.5 93/10/15 15:00:14 rkm
213 * Revision 6.4 93/10/13 16:56:04 rkm
214 * Added LM cache line stats.
215 * Added bg_only option for lm_read parameter list
216 * (but not yet implemented).
217 * Changed proc name ilm_LOG_prob_of to lm3g_prob, to avoid conflict
218 * with Roni's ILM function of the same name.
220 * Revision 6.3 93/10/09 17:01:55 rkm
221 * M K Ravishankar (rkm@cs) at Carnegie Mellon
222 * Cleaned up handling precompiled binary 3g LM file,
223 * Added SWAP option for HP platforms.
225 * Revision 6.2 93/10/06 11:08:15 rkm
226 * M K Ravishankar (rkm@cs) at Carnegie Mellon University
227 * Darpa Trigram LM module. Created.
230 #include <stdio.h>
231 #include <string.h>
232 #include <stdlib.h>
233 #include <sys/types.h>
234 #include <sys/stat.h>
236 #define QUIT(x) {fflush(stdout); fprintf x; exit(-1);}
238 #include "s2types.h"
239 #include "CM_macros.h"
240 #include "assert.h"
241 #include "strfuncs.h"
242 #include "linklist.h"
243 #include "list.h"
244 #include "hash.h"
245 #include "err.h"
246 #include "lm_3g.h"
247 #include "log.h"
249 #define NO_WORD 0
251 #define UG_MAPID(m,u) ((m)->unigrams[u].mapid)
252 #define UG_PROB_F(m,u) ((m)->unigrams[u].prob1.f)
253 #define UG_BO_WT_F(m,u) ((m)->unigrams[u].bo_wt1.f)
254 #define UG_PROB_L(m,u) ((m)->unigrams[u].prob1.l)
255 #define UG_BO_WT_L(m,u) ((m)->unigrams[u].bo_wt1.l)
256 #define FIRST_BG(m,u) ((m)->unigrams[u].bigrams)
257 #define LAST_BG(m,u) (FIRST_BG((m),(u)+1)-1)
259 #define BG_WID(m,b) ((m)->bigrams[b].wid)
260 #define BG_PROB_F(m,b) ((m)->prob2[(m)->bigrams[b].prob2].f)
261 #define BG_BO_WT_F(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].f)
262 #define BG_PROB_L(m,b) ((m)->prob2[(m)->bigrams[b].prob2].l)
263 #define BG_BO_WT_L(m,b) ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].l)
264 #define TSEG_BASE(m,b) ((m)->tseg_base[(b)>>LOG_BG_SEG_SZ])
265 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->bigrams[b].trigrams))
266 #define LAST_TG(m,b) (FIRST_TG((m),(b)+1)-1)
268 #define TG_WID(m,t) ((m)->trigrams[t].wid)
269 #define TG_PROB_F(m,t) ((m)->prob3[(m)->trigrams[t].prob3].f)
270 #define TG_PROB_L(m,t) ((m)->prob3[(m)->trigrams[t].prob3].l)
272 /* FIXME: put this in a header file */
273 extern void quit (int status, char const *fmt, ...);
275 static char const *start_sym = "<s>";
276 static char const *end_sym = "</s>";
280 #define MIN_PROB_F -99.0
282 #define MAX_SORTED_ENTRIES 65534
284 /* Base values for ranges of unigram_t.mapid */
285 #define LM_DICTWID_BASE 0 /* Do not change this */
286 #define LM_CLASSID_BASE 0x01000000 /* Interpreted as LMclass ID */
287 #define LM_DICTWID_BADMAP -16000 /* An illegal mapping */
292 * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed
293 * to replace spurious values in the Darpa LM file.
295 static void init_sorted_list (sorted_list_t *l)
297 l->list =
298 (sorted_entry_t *) CM_calloc (MAX_SORTED_ENTRIES, sizeof (sorted_entry_t));
299 l->list[0].val.f = MIN_PROB_F;
300 l->list[0].lower = 0;
301 l->list[0].higher = 0;
302 l->free = 1;
305 static void free_sorted_list (sorted_list_t *l)
307 free (l->list);
310 static log_t *vals_in_sorted_list (sorted_list_t *l)
312 log_t *vals;
313 int32 i;
315 vals = (log_t *) CM_calloc (l->free, sizeof (log_t));
316 for (i = 0; i < l->free; i++)
317 vals[i].f = l->list[i].val.f;
318 return (vals);
321 static int32 sorted_id (sorted_list_t *l, float *val)
323 int32 i = 0;
325 for (;;) {
326 if (*val == l->list[i].val.f)
327 return (i);
328 if (*val < l->list[i].val.f) {
329 if (l->list[i].lower == 0) {
330 if (l->free >= MAX_SORTED_ENTRIES)
331 E_FATAL("sorted list overflow\n");
333 l->list[i].lower = l->free;
334 (l->free)++;
335 i = l->list[i].lower;
336 l->list[i].val.f = *val;
337 return (i);
338 } else
339 i = l->list[i].lower;
340 } else {
341 if (l->list[i].higher == 0) {
342 if (l->free >= MAX_SORTED_ENTRIES)
343 E_FATAL("sorted list overflow\n");
345 l->list[i].higher = l->free;
346 (l->free)++;
347 i = l->list[i].higher;
348 l->list[i].val.f = *val;
349 return (i);
350 } else
351 i = l->list[i].higher;
357 * allocate, initialize and return pointer to an array of unigram entries.
359 static unigram_t *NewUnigramTable (int32 n_ug)
361 unigram_t *table;
362 int32 i;
364 table = (unigram_t *) CM_calloc (n_ug, sizeof (unigram_t));
365 for (i = 0; i < n_ug; i++) {
366 table[i].mapid = NO_WORD;
367 table[i].prob1.f = -99.0;
368 table[i].bo_wt1.f = -99.0;
370 return table;
374 * returns a pointer to a new language model record. The size is passed in
375 * as a parameter.
377 lm_t *
378 NewModel (n_ug, n_bg, n_tg)
379 int32 n_ug;
380 int32 n_bg;
381 int32 n_tg;
383 lm_t *model;
385 model = (lm_t *) CM_calloc (1, sizeof (lm_t));
388 * Allocate one extra unigram and bigram entry: sentinels to terminate
389 * followers (bigrams and trigrams, respectively) of previous entry.
391 model->unigrams = NewUnigramTable (n_ug+1);
392 model->bigrams = (bigram_t *) CM_calloc (n_bg+1, sizeof (bigram_t));
393 if (n_tg > 0)
394 model->trigrams = (trigram_t *) CM_calloc (n_tg, sizeof (trigram_t));
396 if (n_tg > 0) {
397 model->tseg_base = (int32 *) CM_calloc ((n_bg+1)/BG_SEG_SZ+1, sizeof (int32));
398 #if 0
399 E_INFO("%8d = tseg_base entries allocated\n",
400 (n_bg+1)/BG_SEG_SZ+1);
401 #endif
404 model->max_ucount = model->ucount = n_ug;
405 model->bcount = n_bg;
406 model->tcount = n_tg;
408 model->HT.size = 0;
409 model->HT.inuse = 0;
410 model->HT.tab = NULL;
412 return model;
415 int32 lm3g_wstr2wid (lm_t *model, const char *w)
417 caddr_t val;
419 if (hash_lookup (&(model->HT), w, &val) != 0)
420 return NO_WORD;
421 return ((int32) val);
425 * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
427 static void ReadNgramCounts (FILE *fp,
428 int32 *n_ug, int32 *n_bg, int32 *n_tg)
430 char string[256];
431 int32 ngram, ngram_cnt;
433 /* skip file until past the '\data\' marker */
435 fgets (string, sizeof (string), fp);
436 while ( (strcmp (string, "\\data\\\n") != 0) && (! feof (fp)) );
438 if (strcmp (string, "\\data\\\n") != 0)
439 E_FATAL("No \\data\\ mark in LM file\n");
441 *n_ug = *n_bg = *n_tg = 0;
442 while (fgets (string, sizeof (string), fp) != NULL) {
443 if (sscanf (string, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
444 break;
445 switch (ngram) {
446 case 1: *n_ug = ngram_cnt;
447 break;
448 case 2: *n_bg = ngram_cnt;
449 break;
450 case 3: *n_tg = ngram_cnt;
451 break;
452 default:
453 E_FATAL("Unknown ngram (%d)\n", ngram);
454 break;
458 /* Position file to just after the unigrams header '\1-grams:\' */
459 while ( (strcmp (string, "\\1-grams:\n") != 0) && (! feof (fp)) )
460 fgets (string, sizeof (string), fp);
462 /* Check counts; NOTE: #trigrams *CAN* be 0 */
463 if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0))
464 E_FATAL("Bad or missing ngram count\n");
468 * Read in the unigrams from given file into the LM structure model. On
469 * entry to this procedure, the file pointer is positioned just after the
470 * header line '\1-grams:'.
472 static void ReadUnigrams (FILE *fp, lm_t *model)
474 char string[256];
475 char name[128];
476 int32 wcnt;
477 float p1, bo_wt;
479 E_INFO ("Reading unigrams\n");
481 wcnt = 0;
482 while ((fgets (string, sizeof(string), fp) != NULL) &&
483 (strcmp (string, "\\2-grams:\n") != 0))
485 if (sscanf (string, "%f %s %f", &p1, name, &bo_wt) != 3) {
486 if (string[0] != '\n')
487 E_WARN ("Format error; unigram ignored:%s", string);
488 continue;
491 if (wcnt >= model->ucount)
492 E_FATAL("Too many unigrams\n");
494 /* Associate name with word id */
495 model->word_str[wcnt] = (char *) salloc (name);
496 hash_add (&(model->HT), model->word_str[wcnt], (caddr_t) (wcnt+1));
497 model->unigrams[wcnt].prob1.f = p1;
498 model->unigrams[wcnt].bo_wt1.f = bo_wt;
500 model->unigrams[wcnt].mapid = wcnt;
501 wcnt++;
504 if (model->ucount != wcnt) {
505 E_WARN ("lm_t.ucount(%d) != #unigrams read(%d)\n",
506 model->ucount, wcnt);
507 model->ucount = wcnt;
512 * Read bigrams from given file into given model structure. File may be arpabo
513 * or arpabo-id format, depending on idfmt = 0 or 1.
515 static void ReadBigrams (FILE *fp, lm_t *model, int32 idfmt)
517 char string[1024], word1[256], word2[256];
518 int32 w1, w2, prev_w1, bgcount, p;
519 bigram_t *bgptr;
520 float p2, bo_wt;
521 int32 n_fld, n;
523 E_INFO ("Reading bigrams\n");
525 bgcount = 0;
526 bgptr = model->bigrams;
527 prev_w1 = -1;
528 n_fld = (model->tcount > 0) ? 4 : 3;
530 bo_wt = 0.0;
531 while (fgets (string, sizeof(string), fp) != NULL) {
532 if (! idfmt)
533 n = sscanf (string, "%f %s %s %f", &p2, word1, word2, &bo_wt);
534 else
535 n = sscanf (string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt);
536 if (n < n_fld) {
537 if (string[0] != '\n')
538 break;
539 continue;
542 if (! idfmt) {
543 if ((w1 = lm3g_wstr2wid (model, word1)) == NO_WORD)
544 E_FATAL("Unknown word: %s\n", word1);
545 w1 --; // hash value is 1-base
546 if ((w2 = lm3g_wstr2wid (model, word2)) == NO_WORD)
547 E_FATAL("Unknown word: %s\n", word2);
548 w2 --; // hash value is 1-base
549 } else {
550 if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w1 < 0) || (w2 < 0))
551 E_FATAL("Bad bigram: %s", string);
554 /* HACK!! to quantize probs to 4 decimal digits */
555 p = p2*10000;
556 p2 = p*0.0001;
557 p = bo_wt*10000;
558 bo_wt = p*0.0001;
560 if (bgcount >= model->bcount)
561 E_FATAL("Too many bigrams\n");
563 bgptr->wid = w2;
564 bgptr->prob2 = sorted_id (&model->sorted_prob2, &p2);
565 if (model->tcount > 0)
566 bgptr->bo_wt2 = sorted_id (&model->sorted_bo_wt2, &bo_wt);
568 if (w1 != prev_w1) {
569 if (w1 < prev_w1)
570 E_FATAL("Bigrams not in unigram order\n");
572 for (prev_w1++; prev_w1 <= w1; prev_w1++)
573 model->unigrams[prev_w1].bigrams = bgcount;
574 prev_w1 = w1;
577 bgcount++;
578 bgptr++;
580 if ((bgcount & 0x0000ffff) == 0) {
581 E_INFOCONT (".");
584 if ((strcmp (string, "\\end\\\n") != 0) && (strcmp (string, "\\3-grams:\n") != 0))
585 E_FATAL("Bad bigram: %s\n", string);
587 for (prev_w1++; prev_w1 <= model->ucount; prev_w1++)
588 model->unigrams[prev_w1].bigrams = bgcount;
592 * Very similar to ReadBigrams.
594 static void ReadTrigrams (FILE *fp, lm_t *model, int32 idfmt)
596 char string[1024], word1[256], word2[256], word3[256];
597 int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg, p;
598 int32 seg, prev_seg, prev_seg_lastbg;
599 trigram_t *tgptr;
600 bigram_t *bgptr;
601 float p3;
603 E_INFO ("Reading trigrams\n");
605 tgcount = 0;
606 tgptr = model->trigrams;
607 prev_w1 = -1;
608 prev_w2 = -1;
609 prev_bg = -1;
610 prev_seg = -1;
612 while (fgets (string, sizeof(string), fp) != NULL) {
613 if (! idfmt)
614 n = sscanf (string, "%f %s %s %s", &p3, word1, word2, word3);
615 else
616 n = sscanf (string, "%f %d %d %d", &p3, &w1, &w2, &w3);
617 if (n != 4) {
618 if (string[0] != '\n')
619 break;
620 continue;
623 if (! idfmt) {
624 if ((w1 = lm3g_wstr2wid (model, word1)) == NO_WORD)
625 E_FATAL("Unknown word: %s\n", word1);
626 w1 --; // hash value is 1-base
627 if ((w2 = lm3g_wstr2wid (model, word2)) == NO_WORD)
628 E_FATAL("Unknown word: %s\n", word2);
629 w2 --; // hash value is 1-base
630 if ((w3 = lm3g_wstr2wid (model, word3)) == NO_WORD)
631 E_FATAL("Unknown word: %s\n", word3);
632 w3 --; // hash value is 1-base
633 } else {
634 if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w3 >= model->ucount) ||
635 (w1 < 0) || (w2 < 0) || (w3 < 0))
636 E_FATAL("Bad trigram: %s", string);
639 /* HACK!! to quantize probs to 4 decimal digits */
640 p = p3*10000;
641 p3 = p*0.0001;
643 if (tgcount >= model->tcount)
644 E_FATAL("Too many trigrams\n");
646 tgptr->wid = w3;
647 tgptr->prob3 = sorted_id (&model->sorted_prob3, &p3);
649 if ((w1 != prev_w1) || (w2 != prev_w2)) {
650 /* Trigram for a new bigram; update tg info for all previous bigrams */
651 if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2)))
652 E_FATAL("Trigrams not in bigram order\n");
654 bg = (w1 != prev_w1) ? model->unigrams[w1].bigrams : prev_bg+1;
655 endbg = model->unigrams[w1+1].bigrams;
656 bgptr = model->bigrams + bg;
657 for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
658 if (bg >= endbg)
659 E_FATAL("Missing bigram for trigram: %s", string);
661 /* bg = bigram entry index for <w1,w2>. Update tseg_base */
662 seg = bg >> LOG_BG_SEG_SZ;
663 for (i = prev_seg+1; i <= seg; i++)
664 model->tseg_base[i] = tgcount;
666 /* Update trigrams pointers for all bigrams until bg */
667 if (prev_seg < seg) {
668 int32 tgoff = 0;
670 if (prev_seg >= 0) {
671 tgoff = tgcount - model->tseg_base[prev_seg];
672 if (tgoff > 65535)
673 E_FATAL("Offset from tseg_base > 65535\n");
676 prev_seg_lastbg = ((prev_seg+1) << LOG_BG_SEG_SZ) - 1;
677 bgptr = model->bigrams + prev_bg;
678 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; prev_bg++, bgptr++)
679 bgptr->trigrams = tgoff;
681 for (; prev_bg <= bg; prev_bg++, bgptr++)
682 bgptr->trigrams = 0;
683 } else {
684 int32 tgoff;
686 tgoff = tgcount - model->tseg_base[prev_seg];
687 if (tgoff > 65535)
688 E_FATAL("Offset from tseg_base > 65535\n");
690 bgptr = model->bigrams + prev_bg;
691 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
692 bgptr->trigrams = tgoff;
695 prev_w1 = w1;
696 prev_w2 = w2;
697 prev_bg = bg;
698 prev_seg = seg;
701 tgcount++;
702 tgptr++;
704 if ((tgcount & 0x0000ffff) == 0) {
705 E_INFOCONT (".");
708 if (strcmp (string, "\\end\\\n") != 0)
709 E_FATAL("Bad trigram: %s\n", string);
711 for (prev_bg++; prev_bg <= model->bcount; prev_bg++) {
712 if ((prev_bg & (BG_SEG_SZ-1)) == 0)
713 model->tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
714 if ((tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535)
715 E_FATAL("Offset from tseg_base > 65535\n");
716 model->bigrams[prev_bg].trigrams =
717 tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ];
721 static FILE *lm_file_open (char const *filename, int32 usepipe)
723 char command[1024];
724 FILE *fp;
726 if (usepipe) {
727 #ifdef WIN32
728 sprintf (command, "D:\\compress\\gzip.exe -d -c %s", filename);
729 if ((fp = _popen (command, "r")) == NULL)
730 E_FATAL("Cannot popen %s\n", command);
731 #else
732 sprintf (command, "zcat %s", filename);
733 if ((fp = popen (command, "r")) == NULL)
734 E_FATAL("Cannot popen %s\n", command);
735 #endif
737 else {
738 fp = CM_fopen (filename, "r");
740 return (fp);
744 * Read in a trigram language model from the given file. The LM tokens can be word
745 * classes. However a given actual word can belong to AT MOST ONE of the LM classes
746 * used by this LM.
748 lm_t* lm3g_read (char const *filename)
750 lm_t *model;
751 FILE *fp = NULL;
752 int32 usingPipe = FALSE;
753 int32 n_unigram;
754 int32 n_bigram;
755 int32 n_trigram;
756 int32 dict_size;
757 int32 i, j, k, last_bg, last_tg;
758 struct stat statbuf;
759 int32 idfmt;
761 E_INFO ("Reading LM file %s\n", filename);
763 /* Check if a compressed file */
764 k = strlen(filename);
765 #ifdef WIN32
766 usingPipe = (k > 3) &&
767 ((strcmp (filename+k-3, ".gz") == 0) || (strcmp (filename+k-3, ".GZ") == 0));
768 #else
769 usingPipe = (k > 2) &&
770 ((strcmp (filename+k-2, ".Z") == 0) || (strcmp (filename+k-2, ".z") == 0));
771 #endif
772 /* Check if an .arpabo-id format file; More HACK!! Hardwired check for -id */
773 if (usingPipe)
774 k -= 2;
775 idfmt = ((k > 3) && (strncmp (filename+k-3, "-id", 3) == 0));
777 fp = lm_file_open (filename, usingPipe);
778 if (stat(filename, &statbuf) < 0)
779 E_FATAL("stat(%s) failed\n", filename);
781 /* Read #unigrams, #bigrams, #trigrams from file */
782 ReadNgramCounts (fp, &n_unigram, &n_bigram, &n_trigram);
783 E_INFO ("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
785 /* Determine dictionary size (for dict-wid -> LM-wid map) */
786 dict_size = n_unigram;
788 if (dict_size >= 65535)
789 E_FATAL("#dict-words(%d) > 65534\n", dict_size);
791 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
792 model = NewModel (n_unigram, n_bigram, n_trigram);
793 model->word_str = (char **) CM_calloc (n_unigram, sizeof (char *));
794 model->n_word_str = n_unigram;
796 /* Load the precompiled binary dump form of the Darpa LM file if it exists */
797 if (1) {
798 ReadUnigrams (fp, model);
799 E_INFO("%8d = #unigrams created\n", model->ucount);
801 init_sorted_list (&model->sorted_prob2);
802 if (model->tcount > 0)
803 init_sorted_list (&model->sorted_bo_wt2);
805 ReadBigrams (fp, model, idfmt);
807 model->bcount = FIRST_BG(model,model->ucount);
808 model->n_prob2 = model->sorted_prob2.free;
809 model->prob2 = vals_in_sorted_list (&model->sorted_prob2);
810 free_sorted_list (&model->sorted_prob2);
811 E_INFO("\n%8d = #bigrams created\n", model->bcount);
812 E_INFO("%8d = #prob2 entries\n", model->n_prob2);
814 if (model->tcount > 0) {
815 /* Create trigram bo-wts array */
816 model->n_bo_wt2 = model->sorted_bo_wt2.free;
817 model->bo_wt2 = vals_in_sorted_list (&model->sorted_bo_wt2);
818 free_sorted_list (&model->sorted_bo_wt2);
819 E_INFO("%8d = #bo_wt2 entries\n", model->n_bo_wt2);
821 init_sorted_list (&model->sorted_prob3);
823 ReadTrigrams (fp, model, idfmt);
825 model->tcount = FIRST_TG(model,model->bcount);
826 model->n_prob3 = model->sorted_prob3.free;
827 model->prob3 = vals_in_sorted_list (&model->sorted_prob3);
828 E_INFO("\n%8d = #trigrams created\n", model->tcount);
829 E_INFO("%8d = #prob3 entries\n", model->n_prob3);
831 free_sorted_list (&model->sorted_prob3);
835 fclose (fp);
838 * Discourage expansion of end_sym and transition to start_sym. (The given
839 * Darpa LM may contain some spurious values that don't reflect these
840 * requirements.)
842 /* bo_wt(</s>) = MIN_PROB_F */
843 for (i = 0; (i < model->ucount) && (strcmp (model->word_str[i], end_sym) != 0); i++);
844 E_INFO("bo_wt(%s) changed from %.4f to %.4f\n",
845 model->word_str[i], model->unigrams[i].bo_wt1.f, MIN_PROB_F);
846 model->unigrams[i].bo_wt1.f = MIN_PROB_F;
848 /* unigram prob(<s>) = MIN_PROB_F */
849 for (i = 0; (i < model->ucount) && (strcmp (model->word_str[i], start_sym) != 0); i++);
850 E_INFO("prob(%s) changed from %.4f to %.4f\n",
851 model->word_str[i], model->unigrams[i].prob1.f, MIN_PROB_F);
852 model->unigrams[i].prob1.f = MIN_PROB_F;
854 /* bigram prob(<s>,<s>) = MIN_PROB_F (if bigram exists) */
855 j = FIRST_BG(model,i);
856 last_bg = LAST_BG(model,i);
857 for (; (j<=last_bg) && (strcmp(model->word_str[BG_WID(model,j)],start_sym)!=0); j++);
858 if (j <= last_bg) {
859 E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
860 model->word_str[i], model->word_str[BG_WID(model,j)],
861 model->prob2[model->bigrams[j].prob2].f,
862 model->prob2[0].f);
863 model->bigrams[j].prob2 = 0;
865 if (model->tcount > 0) {
866 /* trigram prob(<s>,<s>,<s>) = MIN_PROB_F (if trigram exists) */
867 k = FIRST_TG(model,j);
868 last_tg = LAST_TG(model,j);
869 for (; k <= last_tg; k++) {
870 if (strcmp (model->word_str[TG_WID(model,k)], start_sym) == 0)
871 break;
873 if (k <= last_tg) {
874 E_INFO("prob(%s,%s,%s) changed from %.4f to %.4f\n",
875 model->word_str[i], model->word_str[BG_WID(model,j)], model->word_str[TG_WID(model,k)],
876 model->prob3[model->trigrams[k].prob3].f,
877 model->prob3[0].f);
878 model->trigrams[k].prob3 = 0;
883 /* bigram prob(<s>,</s>) = MIN_PROB_F (if bigram exists) */
884 j = FIRST_BG(model,i);
885 last_bg = LAST_BG(model,i);
886 for (; (j<=last_bg) && (strcmp(model->word_str[BG_WID(model,j)],end_sym)!=0); j++);
887 if (j <= last_bg) {
888 E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
889 model->word_str[i], model->word_str[BG_WID(model,j)],
890 model->prob2[model->bigrams[j].prob2].f,
891 model->prob2[0].f);
892 model->bigrams[j].prob2 = 0;
895 return model;
898 void lm3g_free (lm_t *model)
900 int32 i,u;
901 tginfo_t *tginfo, *next_tginfo;
903 hash_free (&(model->HT));
904 for (i = 0; i < model->ucount; i++)
905 free (model->word_str[i]);
906 free (model->word_str);
908 free (model->unigrams);
909 free (model->bigrams);
910 free (model->prob2);
911 if (model->tcount > 0) {
912 free (model->trigrams);
913 free (model->tseg_base);
914 free (model->bo_wt2);
915 free (model->prob3);
917 if (model->HT.tab != NULL)
918 hash_free (&model->HT);
920 if (model->tginfo) {
921 for (u = 0; u < model->max_ucount; u++)
922 for (tginfo = model->tginfo[u]; tginfo; tginfo = next_tginfo) {
923 next_tginfo = tginfo->next;
924 listelem_free ((void *)tginfo, sizeof(tginfo_t));
926 free (model->tginfo);
929 free (model);
932 void lmSetStartSym (char const *sym)
933 /*----------------------------*
934 * Description - reconfigure the start symbol
937 start_sym = (char *) salloc(sym);
940 void lmSetEndSym (char const *sym)
941 /*----------------------------*
942 * Description - reconfigure the end symbol
945 end_sym = (char *) salloc(sym);
948 #ifdef NO_DICT
949 main (argc, argv)
950 int32 argc;
951 char *argv[];
953 lm_t *model;
954 int32 i, n, score;
955 float lw, uw, wip;
956 char wd[3][100], line[1000];
957 int32 wid[3];
958 int32 unkwid;
960 if (argc < 4)
961 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
962 if (sscanf(argv[1], "%f", &lw) != 1)
963 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
964 if (sscanf(argv[2], "%f", &uw) != 1)
965 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
966 if (sscanf(argv[3], "%f", &wip) != 1)
967 E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
968 if (argc == 5)
969 lm_read(argv[4], lw, uw, wip, 1.0);
970 else
971 lm_read("/net/alf8/cdrom/wsj1/grammar/tgboc20o.nvp", lw, uw, wip, 1.0);
973 lmSetParameters (model, 0);
975 unkwid = lm3g_wstr2wid (model, "<UNK>");
976 unkwid --; // hash value is 1-base
977 for (;;) {
978 printf ("Enter 1, 2, or 3 words: ");
979 fgets (line, sizeof(line), stdin);
980 if (((n = sscanf (line, "%s %s %s", wd[0], wd[1], wd[2])) < 1) || (n > 3))
981 break;
982 for (i = 0; i < n; i++)
983 if ((wid[i] = lm3g_wstr2wid (model, wd[i])) == NO_WORD) {
984 printf (" %s -> <UNK>\n", wd[i]);
985 wid[i] = unkwid;
987 else
988 wid[i] --;
990 switch (n) {
991 case 1:
992 score = unigram_score (wid[0]);
993 printf ("unigram_score(%d) = %d\n",
994 wid[0], score);
995 break;
996 case 2:
997 score = bigram_score (wid[0], wid[1]);
998 printf ("bigram_score(%d, %d) = %d\n",
999 wid[0], wid[1], score);
1000 break;
1001 case 3:
1002 score = trigram_score (wid[0], wid[1], wid[2]);
1003 printf ("trigram_score(%d, %d, %d) = %d\n",
1004 wid[0], wid[1], wid[2], score);
1005 break;
1006 default:
1007 break;
1011 #endif
1013 #define BINARY_SEARCH_THRESH 16
1015 float lm3g_ug_score (lm_t *lm, int32 wid)
1017 if (wid >= lm->ucount)
1018 E_FATAL("dictwid[%d] not in LM\n", wid);
1020 return (lm->unigrams[wid].prob1.f);
1023 /* Locate a specific bigram within a bigram list */
1024 static int32 find_bg (bigram_t *bg, int32 n, int32 w)
1026 int32 i, b, e;
1028 /* Binary search until segment size < threshold */
1029 b = 0;
1030 e = n;
1031 while (e-b > BINARY_SEARCH_THRESH) {
1032 i = (b+e)>>1;
1033 if (bg[i].wid < w)
1034 b = i+1;
1035 else if (bg[i].wid > w)
1036 e = i;
1037 else
1038 return i;
1041 /* Linear search within narrowed segment */
1042 for (i = b; (i < e) && (bg[i].wid != w); i++);
1043 return ((i < e) ? i : -1);
1046 /* w1, w2 are dictionary (base-)word ids */
1047 float lm3g_bg_score (lm_t *lm, int32 w1, int32 w2)
1049 int32 i, n, b;
1050 float score;
1051 bigram_t *bg;
1053 /* lm->n_bg_score++; */
1055 if (w1 >= lm->ucount)
1056 E_FATAL("dictwid[%d] not in LM\n", w1);
1057 if (w2 >= lm->ucount)
1058 E_FATAL("dictwid[%d] not in LM\n", w2);
1060 b = FIRST_BG(lm, w1);
1061 n = FIRST_BG(lm, w1+1) - b;
1062 bg = lm->bigrams + b;
1064 if ((i = find_bg (bg, n, w2)) >= 0) {
1065 score = lm->prob2[bg[i].prob2].f;
1066 } else {
1067 /* lm->n_bg_bo++; */
1068 score = lm->unigrams[w1].bo_wt1.f + lm->unigrams[w2].prob1.f;
1071 return (score);
1074 static void load_tginfo (lm_t *lm, int32 lw1, int32 lw2)
1076 int32 i, n, b, t;
1077 bigram_t *bg;
1078 tginfo_t *tginfo;
1080 /* First allocate space for tg information for bg lw1,lw2 */
1081 tginfo = (tginfo_t *) listelem_alloc (sizeof(tginfo_t));
1082 tginfo->w1 = lw1;
1083 tginfo->tg = NULL;
1084 tginfo->next = lm->tginfo[lw2];
1085 lm->tginfo[lw2] = tginfo;
1087 /* Locate bigram lw1,lw2 */
1089 b = lm->unigrams[lw1].bigrams;
1090 n = lm->unigrams[lw1+1].bigrams - b;
1091 bg = lm->bigrams + b;
1093 if ((n > 0) && ((i = find_bg (bg, n, lw2)) >= 0)) {
1094 tginfo->bowt = lm->bo_wt2[bg[i].bo_wt2].f;
1096 /* Find t = Absolute first trigram index for bigram lw1,lw2 */
1097 b += i; /* b = Absolute index of bigram lw1,lw2 on disk */
1098 t = FIRST_TG(lm, b);
1100 tginfo->tg = lm->trigrams + t;
1102 /* Find #tg for bigram w1,w2 */
1103 tginfo->n_tg = FIRST_TG(lm, b+1) - t;
1104 } else { /* No bigram w1,w2 */
1105 tginfo->bowt = 0;
1106 tginfo->n_tg = 0;
1110 /* Similar to find_bg */
1111 static int32 find_tg (trigram_t *tg, int32 n, int32 w)
1113 int32 i, b, e;
1115 b = 0;
1116 e = n;
1117 while (e-b > BINARY_SEARCH_THRESH) {
1118 i = (b+e)>>1;
1119 if (tg[i].wid < w)
1120 b = i+1;
1121 else if (tg[i].wid > w)
1122 e = i;
1123 else
1124 return i;
1127 for (i = b; (i < e) && (tg[i].wid != w); i++);
1128 return ((i < e) ? i : -1);
1131 /* w1, w2, w3 are dictionary wids */
1132 float lm3g_tg_score (lm_t *lm, int32 w1, int32 w2, int32 w3)
1134 int32 i, n;
1135 float score;
1136 trigram_t *tg;
1137 tginfo_t *tginfo, *prev_tginfo;
1139 if ((lm->tcount <= 0) || (w1 < 0))
1140 return (lm3g_bg_score (lm, w2, w3));
1142 /* lm->n_tg_score++; */
1144 if (w1 >= lm->ucount)
1145 E_FATAL("dictwid[%d] not in LM\n", w1);
1146 if (w2 >= lm->ucount)
1147 E_FATAL("dictwid[%d] not in LM\n", w2);
1148 if (w3 >= lm->ucount)
1149 E_FATAL("dictwid[%d] not in LM\n", w3);
1151 prev_tginfo = NULL;
1152 for (tginfo = lm->tginfo[w2]; tginfo; tginfo = tginfo->next) {
1153 if (tginfo->w1 == w1)
1154 break;
1155 prev_tginfo = tginfo;
1158 if (! tginfo) {
1159 load_tginfo (lm, w1, w2);
1160 tginfo = lm->tginfo[w2];
1161 } else if (prev_tginfo) {
1162 prev_tginfo->next = tginfo->next;
1163 tginfo->next = lm->tginfo[w2];
1164 lm->tginfo[w2] = tginfo;
1167 tginfo->used = 1;
1169 /* Trigrams for w1,w2 now pointed to by tginfo */
1170 n = tginfo->n_tg;
1171 tg = tginfo->tg;
1172 if ((i = find_tg (tg, n, w3)) >= 0) {
1173 score = lm->prob3[tg[i].prob3].f;
1174 } else {
1175 /* lm->n_tg_bo++; */
1176 score = tginfo->bowt + lm3g_bg_score(lm, w2, w3);
1179 return (score);
1182 int lm3g_word_str_size(lm_t *lm)
1184 return lm->n_word_str;