libvspell/lm/lm_3g.c

   1 /* ====================================================================
   2  * Copyright (c) 1999-2001 Carnegie Mellon University.  All rights
   3  * reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  *
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in
  14  *    the documentation and/or other materials provided with the
  15  *    distribution.
  16  *
  17  * This work was supported in part by funding from the Defense Advanced
  18  * Research Projects Agency and the National Science Foundation of the
  19  * United States of America, and the CMU Sphinx Speech Consortium.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
  22  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  23  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  24  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
  25  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32  *
  33  * ====================================================================
  34  *
  35  */
  36
  37 /*
  38  *
  39  * lm_3g.c -- Darpa Trigram LM module.
  40  *
  41  * HISTORY
  42  *
  43  * $Log: lm_3g.c,v $
  44  * Revision 1.14  2005/09/01 21:09:54  dhdfu
  45  * Really, actually, truly consolidate byteswapping operations into
  46  * byteorder.h.  Where unconditional byteswapping is needed, SWAP_INT32()
  47  * and SWAP_INT16() are to be used.  The WORDS_BIGENDIAN macro from
  48  * autoconf controls the functioning of the conditional swap macros
  49  * (SWAP_?[LW]) whose names and semantics have been regularized.
  50  * Private, adhoc macros have been removed.
  51  *
  52  * Revision 1.13  2004/12/10 16:48:56  rkm
  53  * Added continuous density acoustic model handling
  54  *
  55  * Revision 1.12  2004/07/16 00:57:11  egouvea
  56  * Added Ravi's implementation of FSG support.
  57  *
  58  * Revision 1.1.1.1  2004/03/01 14:30:29  rkm
  59  *
  60  *
  61  * Revision 1.2  2004/02/27 16:15:13  rkm
  62  * Added FSG switching
  63  *
  64  * Revision 1.1.1.1  2003/12/03 20:05:04  rkm
  65  * Initial CVS repository
  66  *
  67  * Revision 1.11  2001/12/11 00:24:48  lenzo
  68  * Acknowledgement in License.
  69  *
  70  * Revision 1.10  2001/12/07 17:30:02  lenzo
  71  * Clean up and remove extra lines.
  72  *
  73  * Revision 1.9  2001/12/07 13:11:30  lenzo
  74  * Consolidate byte order code into byteorder.h.  Note that there are still
  75  * two "senses" of byte swapping that are confusing, and all this should
  76  * be cleaned up and verified.
  77  *
  78  * Revision 1.8  2001/12/07 05:09:30  lenzo
  79  * License.xsxc
  80  *
  81  * Revision 1.7  2001/12/07 04:27:35  lenzo
  82  * License cleanup.  Remove conditions on the names.  Rationale: These
  83  * conditions don't belong in the license itself, but in other fora that
  84  * offer protection for recognizeable names such as "Carnegie Mellon
  85  * University" and "Sphinx."  These changes also reduce interoperability
  86  * issues with other licenses such as the Mozilla Public License and the
  87  * GPL.  This update changes the top-level license files and removes the
  88  * old license conditions from each of the files that contained it.
  89  * All files in this collection fall under the copyright of the top-level
  90  * LICENSE file.
  91  *
  92  * Revision 1.6  2001/10/23 22:20:30  lenzo
  93  * Change error logging and reporting to the E_* macros that call common
  94  * functions.  This will obsolete logmsg.[ch] and they will be removed
  95  * or changed in future versions.
  96  *
  97  * Revision 1.5  2001/02/13 19:51:38  lenzo
  98  * *** empty log message ***
  99  *
 100  * Revision 1.4  2001/02/13 18:50:35  lenzo
 101  * Adding some more comments for a Solaris port.
 102  *
 103  * Revision 1.3  2000/12/12 23:01:42  lenzo
 104  * Rationalizing libs and names some more.  Split a/d and fe libs out.
 105  *
 106  * Revision 1.2  2000/12/05 01:45:12  lenzo
 107  * Restructuring, hear rationalization, warning removal, ANSIfy
 108  *
 109  * Revision 1.1.1.1  2000/01/28 22:08:51  lenzo
 110  * Initial import of sphinx2
 111  *
 112  *
 113  * 28-Oct-98    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 114  *              Added lm3g_access_type() and necessary support.
 115  *
 116  * 15-Oct-98    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 117  *              Bugfix: inclass_ugscore[lw3] changed to inclass_ugscore[w3] in
 118  *              lm3g_tg_score().  (Thanks to dbansal@cs.)
 119  *
 120  * 15-Jul-98    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 121  *              Corrected references to unigram_t.wid to unigram_t.mapid.
 122  *
 123  * 14-Apr-98    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 124  *              Added lm3g_n_lm() and lm3g_index2name().
 125  *
 126  * 03-Apr-97    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 127  *              Added lm3g_raw_score() and lm_t.invlw.
 128  *              Changed a number of function names from lm_... to lm3g_...
 129  *
 130  * 09-Jan-97    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 131  *              BUGFIX: Added check for lmp->unigrams[i].wid in lm_set_current().
 132  *
 133  * 06-Dec-95    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 134  *              Changed function name lmname_to_lm() to lm_name2lm().
 135  *
 136  * 06-Dec-95    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 137  *              Changed function name get_current_lm to lm_get_current.
 138  *              Changed check for already existing word in lm_add_word, and added
 139  *              condition to updating dictwid_map.
 140  *
 141  * 01-Jul-95    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 142  *              Removed LM cache and replaced with find_bg and find_tg within the main
 143  *              bigrams and trigram structures.  No loss of speed and uses less memory.
 144  *
 145  * 24-Jun-95    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 146  *              Fixed a number of memory leaks while deleting an LM.  Added the global
 147  *              dictwid_map, and allocated it once and never freed.  Made sure lm_cache
 148  *              is created only once.
 149  *
 150  * 14-Jun-95    M K Ravishankar (rkm@cs) at Carnegie Mellon University
 151  *              Modified lm_read to return 0 on success, and to delete any existing LM
 152  *              with the new LM name (instead of reporting error and exiting).
 153  *              Added backslash option in building filenames (for PC compatibility).
 154  *
 155  * Revision 8.9  94/10/11  12:36:28  rkm
 156  * Changed lm_tg_score to call lm_bg_score if no trigrams present or
 157  * the first word is invalid.
 158  *
 159  * Revision 8.8  94/07/29  11:54:23  rkm
 160  * Renamed lmSetParameters to lm_set_param and moved it into lm_add().
 161  * Added functions lm_init_oov() to create initial list of OOVs,
 162  * and lm_add_word() to add new OOV at run time.
 163  *
 164  * Revision 8.7  94/05/19  14:19:59  rkm
 165  * Rewrote LM cache code for greater efficiency.
 166  *
 167  * Revision 8.6  94/05/10  10:47:58  rkm
 168  * Added lm_add() and lm_set_param() functions, for dynamically adding a new
 169  * in-memory LM to the set of available LMs.
 170  *
 171  * Revision 8.5  94/04/22  13:53:27  rkm
 172  * Added query_lm_cache_lines() to allow run-time spec of #cache lines.
 173  *
 174  * Revision 8.4  94/04/14  15:08:46  rkm
 175  * Added function lm_delete() to delete a named LM and reclaim space.
 176  *
 177  * Revision 8.3  94/04/14  14:40:27  rkm
 178  * Minor changes.
 179  *
 180  * Revision 8.1  94/02/15  15:09:22  rkm
 181  * Derived from v7.  Includes multiple LMs for grammar switching.
 182  *
 183  * Revision 6.13  94/02/11  13:14:45  rkm
 184  * Added bigram and trigram multi-line caches, and functions, for v7.
 185  * Replaced sequential search in lm3g_wstr2wid() with hash_lookup().
 186  *
 187  * Revision 6.12  94/01/07  10:56:16  rkm
 188  * Corrected bug relating to input file format.
 189  *
 190  * Revision 6.11  93/12/17  13:14:52  rkm
 191  * *** empty log message ***
 192  *
 193  * Revision 6.10  93/12/03  17:09:59  rkm
 194  * Added ability to handle bigram-only dump files.
 195  * Added <s> </s> bigram -> MIN_PROB.
 196  * Added timestamp to dump files.
 197  *
 198  * Revision 6.9  93/12/01  12:29:55  rkm
 199  * Added ability to handle LM files containing only bigrams.
 200  * Excluded start_sym from interpolation of unigram prob with uniform prob.
 201  *
 202  *
 203  * 93/10/21 rkm@cs.cmu.edu
 204  * Added <c.h>
 205  *
 206  * Revision 6.6  93/10/19  18:58:10  rkm
 207  * Added code to change bigram-prob(<s>,<s>) to very low value.  The
 208  * Darpa LM file contains a spurious value to be ignored.
 209  * Fixed bug that dumps one trigram entry too many.
 210  *
 211  * Revision 6.5  93/10/15  15:00:14  rkm
 212  *
 213  * Revision 6.4  93/10/13  16:56:04  rkm
 214  * Added LM cache line stats.
 215  * Added bg_only option for lm_read parameter list
 216  * (but not yet implemented).
 217  * Changed proc name ilm_LOG_prob_of to lm3g_prob, to avoid conflict
 218  * with Roni's ILM function of the same name.
 219  *
 220  * Revision 6.3  93/10/09  17:01:55  rkm
 221  * M K Ravishankar (rkm@cs) at Carnegie Mellon
 222  * Cleaned up handling precompiled binary 3g LM file,
 223  * Added SWAP option for HP platforms.
 224  *
 225  * Revision 6.2  93/10/06  11:08:15  rkm
 226  * M K Ravishankar (rkm@cs) at Carnegie Mellon University
 227  * Darpa Trigram LM module.  Created.
 228  */
 229
 230 #include <stdio.h>
 231 #include <string.h>
 232 #include <stdlib.h>
 233 #include <sys/types.h>
 234 #include <sys/stat.h>
 235
 236 #define QUIT(x)         {fflush(stdout); fprintf x; exit(-1);}
 237
 238 #include "s2types.h"
 239 #include "CM_macros.h"
 240 #include "assert.h"
 241 #include "strfuncs.h"
 242 #include "linklist.h"
 243 #include "list.h"
 244 #include "hash.h"
 245 #include "err.h"
 246 #include "lm_3g.h"
 247 #include "log.h"
 248
 249 #define NO_WORD 0
 250
 251 #define UG_MAPID(m,u)           ((m)->unigrams[u].mapid)
 252 #define UG_PROB_F(m,u)          ((m)->unigrams[u].prob1.f)
 253 #define UG_BO_WT_F(m,u)         ((m)->unigrams[u].bo_wt1.f)
 254 #define UG_PROB_L(m,u)          ((m)->unigrams[u].prob1.l)
 255 #define UG_BO_WT_L(m,u)         ((m)->unigrams[u].bo_wt1.l)
 256 #define FIRST_BG(m,u)           ((m)->unigrams[u].bigrams)
 257 #define LAST_BG(m,u)            (FIRST_BG((m),(u)+1)-1)
 258
 259 #define BG_WID(m,b)             ((m)->bigrams[b].wid)
 260 #define BG_PROB_F(m,b)          ((m)->prob2[(m)->bigrams[b].prob2].f)
 261 #define BG_BO_WT_F(m,b)         ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].f)
 262 #define BG_PROB_L(m,b)          ((m)->prob2[(m)->bigrams[b].prob2].l)
 263 #define BG_BO_WT_L(m,b)         ((m)->bo_wt2[(m)->bigrams[b].bo_wt2].l)
 264 #define TSEG_BASE(m,b)          ((m)->tseg_base[(b)>>LOG_BG_SEG_SZ])
 265 #define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->bigrams[b].trigrams))
 266 #define LAST_TG(m,b)            (FIRST_TG((m),(b)+1)-1)
 267
 268 #define TG_WID(m,t)             ((m)->trigrams[t].wid)
 269 #define TG_PROB_F(m,t)          ((m)->prob3[(m)->trigrams[t].prob3].f)
 270 #define TG_PROB_L(m,t)          ((m)->prob3[(m)->trigrams[t].prob3].l)
 271
 272 /* FIXME: put this in a header file */
 273 extern void quit (int status, char const *fmt, ...);
 274
 275 static char const *start_sym = "<s>";
 276 static char const *end_sym = "</s>";
 277
 278
 279
 280 #define MIN_PROB_F              -99.0
 281
 282 #define MAX_SORTED_ENTRIES      65534
 283
 284 /* Base values for ranges of unigram_t.mapid */
 285 #define LM_DICTWID_BASE         0               /* Do not change this */
 286 #define LM_CLASSID_BASE         0x01000000      /* Interpreted as LMclass ID */
 287 #define LM_DICTWID_BADMAP       -16000          /* An illegal mapping */
 288
 289
 290
 291 /*
 292  * Initialize sorted list with the 0-th entry = MIN_PROB_F, which may be needed
 293  * to replace spurious values in the Darpa LM file.
 294  */
 295 static void init_sorted_list (sorted_list_t *l)
 296 {
 297     l->list =
 298         (sorted_entry_t *) CM_calloc (MAX_SORTED_ENTRIES, sizeof (sorted_entry_t));
 299     l->list[0].val.f = MIN_PROB_F;
 300     l->list[0].lower = 0;
 301     l->list[0].higher = 0;
 302     l->free = 1;
 303 }
 304
 305 static void free_sorted_list (sorted_list_t *l)
 306 {
 307     free (l->list);
 308 }
 309
 310 static log_t *vals_in_sorted_list (sorted_list_t *l)
 311 {
 312     log_t *vals;
 313     int32 i;
 314
 315     vals = (log_t *) CM_calloc (l->free, sizeof (log_t));
 316     for (i = 0; i < l->free; i++)
 317         vals[i].f = l->list[i].val.f;
 318     return (vals);
 319 }
 320
 321 static int32 sorted_id (sorted_list_t *l, float *val)
 322 {
 323     int32 i = 0;
 324
 325     for (;;) {
 326         if (*val == l->list[i].val.f)
 327             return (i);
 328         if (*val < l->list[i].val.f) {
 329             if (l->list[i].lower == 0) {
 330                 if (l->free >= MAX_SORTED_ENTRIES)
 331                     E_FATAL("sorted list overflow\n");
 332
 333                 l->list[i].lower = l->free;
 334                 (l->free)++;
 335                 i = l->list[i].lower;
 336                 l->list[i].val.f = *val;
 337                 return (i);
 338             } else
 339                 i = l->list[i].lower;
 340         } else {
 341             if (l->list[i].higher == 0) {
 342                 if (l->free >= MAX_SORTED_ENTRIES)
 343                     E_FATAL("sorted list overflow\n");
 344
 345                 l->list[i].higher = l->free;
 346                 (l->free)++;
 347                 i = l->list[i].higher;
 348                 l->list[i].val.f = *val;
 349                 return (i);
 350             } else
 351                 i = l->list[i].higher;
 352         }
 353     }
 354 }
 355
 356 /*
 357  * allocate, initialize and return pointer to an array of unigram entries.
 358  */
 359 static unigram_t *NewUnigramTable (int32 n_ug)
 360 {
 361     unigram_t *table;
 362     int32 i;
 363
 364     table = (unigram_t *) CM_calloc (n_ug, sizeof (unigram_t));
 365     for (i = 0; i < n_ug; i++) {
 366         table[i].mapid = NO_WORD;
 367         table[i].prob1.f = -99.0;
 368         table[i].bo_wt1.f = -99.0;
 369     }
 370     return table;
 371 }
 372
 373 /*
 374  * returns a pointer to a new language model record.  The size is passed in
 375  * as a parameter.
 376  */
 377 lm_t *
 378 NewModel (n_ug, n_bg, n_tg)
 379     int32 n_ug;
 380     int32 n_bg;
 381     int32 n_tg;
 382 {
 383     lm_t *model;
 384
 385     model = (lm_t *) CM_calloc (1, sizeof (lm_t));
 386
 387     /*
 388      * Allocate one extra unigram and bigram entry: sentinels to terminate
 389      * followers (bigrams and trigrams, respectively) of previous entry.
 390      */
 391     model->unigrams     = NewUnigramTable (n_ug+1);
 392     model->bigrams      = (bigram_t *) CM_calloc (n_bg+1, sizeof (bigram_t));
 393     if (n_tg > 0)
 394         model->trigrams = (trigram_t *) CM_calloc (n_tg, sizeof (trigram_t));
 395
 396     if (n_tg > 0) {
 397         model->tseg_base = (int32 *) CM_calloc ((n_bg+1)/BG_SEG_SZ+1, sizeof (int32));
 398 #if 0
 399         E_INFO("%8d = tseg_base entries allocated\n",
 400                (n_bg+1)/BG_SEG_SZ+1);
 401 #endif
 402     }
 403
 404     model->max_ucount = model->ucount = n_ug;
 405     model->bcount = n_bg;
 406     model->tcount = n_tg;
 407
 408     model->HT.size = 0;
 409     model->HT.inuse = 0;
 410     model->HT.tab = NULL;
 411
 412     return model;
 413 }
 414
 415 int32 lm3g_wstr2wid (lm_t *model, const char *w)
 416 {
 417     caddr_t val;
 418
 419     if (hash_lookup (&(model->HT), w, &val) != 0)
 420         return NO_WORD;
 421     return ((int32) val);
 422 }
 423
 424 /*
 425  * Read and return #unigrams, #bigrams, #trigrams as stated in input file.
 426  */
 427 static void ReadNgramCounts (FILE *fp,
 428                              int32 *n_ug, int32 *n_bg, int32 *n_tg)
 429 {
 430     char string[256];
 431     int32 ngram, ngram_cnt;
 432
 433     /* skip file until past the '\data\' marker */
 434     do
 435         fgets (string, sizeof (string), fp);
 436     while ( (strcmp (string, "\\data\\\n") != 0) && (! feof (fp)) );
 437
 438     if (strcmp (string, "\\data\\\n") != 0)
 439         E_FATAL("No \\data\\ mark in LM file\n");
 440
 441     *n_ug = *n_bg = *n_tg = 0;
 442     while (fgets (string, sizeof (string), fp) != NULL) {
 443         if (sscanf (string, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
 444             break;
 445         switch (ngram) {
 446         case 1: *n_ug = ngram_cnt;
 447             break;
 448         case 2: *n_bg = ngram_cnt;
 449             break;
 450         case 3: *n_tg = ngram_cnt;
 451             break;
 452         default:
 453             E_FATAL("Unknown ngram (%d)\n", ngram);
 454             break;
 455         }
 456     }
 457
 458     /* Position file to just after the unigrams header '\1-grams:\' */
 459     while ( (strcmp (string, "\\1-grams:\n") != 0) && (! feof (fp)) )
 460         fgets (string, sizeof (string), fp);
 461
 462     /* Check counts;  NOTE: #trigrams *CAN* be 0 */
 463     if ((*n_ug <= 0) || (*n_bg <= 0) || (*n_tg < 0))
 464         E_FATAL("Bad or missing ngram count\n");
 465 }
 466
 467 /*
 468  * Read in the unigrams from given file into the LM structure model.  On
 469  * entry to this procedure, the file pointer is positioned just after the
 470  * header line '\1-grams:'.
 471  */
 472 static void ReadUnigrams (FILE *fp, lm_t *model)
 473 {
 474     char string[256];
 475     char name[128];
 476     int32 wcnt;
 477     float p1, bo_wt;
 478
 479     E_INFO ("Reading unigrams\n");
 480
 481     wcnt = 0;
 482     while ((fgets (string, sizeof(string), fp) != NULL) &&
 483            (strcmp (string, "\\2-grams:\n") != 0))
 484     {
 485         if (sscanf (string, "%f %s %f", &p1, name, &bo_wt) != 3) {
 486             if (string[0] != '\n')
 487                 E_WARN ("Format error; unigram ignored:%s", string);
 488             continue;
 489         }
 490
 491         if (wcnt >= model->ucount)
 492             E_FATAL("Too many unigrams\n");
 493
 494         /* Associate name with word id */
 495         model->word_str[wcnt] = (char *) salloc (name);
 496         hash_add (&(model->HT), model->word_str[wcnt], (caddr_t) (wcnt+1));
 497         model->unigrams[wcnt].prob1.f = p1;
 498         model->unigrams[wcnt].bo_wt1.f = bo_wt;
 499
 500         model->unigrams[wcnt].mapid = wcnt;
 501         wcnt++;
 502     }
 503
 504     if (model->ucount != wcnt) {
 505         E_WARN ("lm_t.ucount(%d) != #unigrams read(%d)\n",
 506                 model->ucount, wcnt);
 507         model->ucount = wcnt;
 508     }
 509 }
 510
 511 /*
 512  * Read bigrams from given file into given model structure.  File may be arpabo
 513  * or arpabo-id format, depending on idfmt = 0 or 1.
 514  */
 515 static void ReadBigrams (FILE *fp, lm_t *model, int32 idfmt)
 516 {
 517     char string[1024], word1[256], word2[256];
 518     int32 w1, w2, prev_w1, bgcount, p;
 519     bigram_t *bgptr;
 520     float p2, bo_wt;
 521     int32 n_fld, n;
 522
 523     E_INFO ("Reading bigrams\n");
 524
 525     bgcount = 0;
 526     bgptr = model->bigrams;
 527     prev_w1 = -1;
 528     n_fld = (model->tcount > 0) ? 4 : 3;
 529
 530     bo_wt = 0.0;
 531     while (fgets (string, sizeof(string), fp) != NULL) {
 532         if (! idfmt)
 533             n = sscanf (string, "%f %s %s %f", &p2, word1, word2, &bo_wt);
 534         else
 535             n = sscanf (string, "%f %d %d %f", &p2, &w1, &w2, &bo_wt);
 536         if (n < n_fld) {
 537             if (string[0] != '\n')
 538                 break;
 539             continue;
 540         }
 541
 542         if (! idfmt) {
 543             if ((w1 = lm3g_wstr2wid (model, word1)) == NO_WORD)
 544                 E_FATAL("Unknown word: %s\n", word1);
 545             w1 --; // hash value is 1-base
 546             if ((w2 = lm3g_wstr2wid (model, word2)) == NO_WORD)
 547                 E_FATAL("Unknown word: %s\n", word2);
 548             w2 --; // hash value is 1-base
 549         } else {
 550             if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w1 < 0) || (w2 < 0))
 551                 E_FATAL("Bad bigram: %s", string);
 552         }
 553
 554         /* HACK!! to quantize probs to 4 decimal digits */
 555         p = p2*10000;
 556         p2 = p*0.0001;
 557         p = bo_wt*10000;
 558         bo_wt = p*0.0001;
 559
 560         if (bgcount >= model->bcount)
 561             E_FATAL("Too many bigrams\n");
 562
 563         bgptr->wid = w2;
 564         bgptr->prob2 = sorted_id (&model->sorted_prob2, &p2);
 565         if (model->tcount > 0)
 566             bgptr->bo_wt2 = sorted_id (&model->sorted_bo_wt2, &bo_wt);
 567
 568         if (w1 != prev_w1) {
 569             if (w1 < prev_w1)
 570               E_FATAL("Bigrams not in unigram order\n");
 571
 572             for (prev_w1++; prev_w1 <= w1; prev_w1++)
 573                 model->unigrams[prev_w1].bigrams = bgcount;
 574             prev_w1 = w1;
 575         }
 576
 577         bgcount++;
 578         bgptr++;
 579
 580         if ((bgcount & 0x0000ffff) == 0) {
 581             E_INFOCONT (".");
 582         }
 583     }
 584     if ((strcmp (string, "\\end\\\n") != 0) && (strcmp (string, "\\3-grams:\n") != 0))
 585         E_FATAL("Bad bigram: %s\n", string);
 586
 587     for (prev_w1++; prev_w1 <= model->ucount; prev_w1++)
 588         model->unigrams[prev_w1].bigrams = bgcount;
 589 }
 590
 591 /*
 592  * Very similar to ReadBigrams.
 593  */
 594 static void ReadTrigrams (FILE *fp, lm_t *model, int32 idfmt)
 595 {
 596     char string[1024], word1[256], word2[256], word3[256];
 597     int32 i, n, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg, p;
 598     int32 seg, prev_seg, prev_seg_lastbg;
 599     trigram_t *tgptr;
 600     bigram_t *bgptr;
 601     float p3;
 602
 603     E_INFO ("Reading trigrams\n");
 604
 605     tgcount = 0;
 606     tgptr = model->trigrams;
 607     prev_w1 = -1;
 608     prev_w2 = -1;
 609     prev_bg = -1;
 610     prev_seg = -1;
 611
 612     while (fgets (string, sizeof(string), fp) != NULL) {
 613         if (! idfmt)
 614             n = sscanf (string, "%f %s %s %s", &p3, word1, word2, word3);
 615         else
 616             n = sscanf (string, "%f %d %d %d", &p3, &w1, &w2, &w3);
 617         if (n != 4) {
 618             if (string[0] != '\n')
 619                 break;
 620             continue;
 621         }
 622
 623         if (! idfmt) {
 624             if ((w1 = lm3g_wstr2wid (model, word1)) == NO_WORD)
 625                 E_FATAL("Unknown word: %s\n", word1);
 626             w1 --; // hash value is 1-base
 627             if ((w2 = lm3g_wstr2wid (model, word2)) == NO_WORD)
 628                 E_FATAL("Unknown word: %s\n", word2);
 629             w2 --; // hash value is 1-base
 630             if ((w3 = lm3g_wstr2wid (model, word3)) == NO_WORD)
 631                 E_FATAL("Unknown word: %s\n", word3);
 632             w3 --; // hash value is 1-base
 633         } else {
 634             if ((w1 >= model->ucount) || (w2 >= model->ucount) || (w3 >= model->ucount) ||
 635                     (w1 < 0) || (w2 < 0) || (w3 < 0))
 636                 E_FATAL("Bad trigram: %s", string);
 637         }
 638
 639         /* HACK!! to quantize probs to 4 decimal digits */
 640         p = p3*10000;
 641         p3 = p*0.0001;
 642
 643         if (tgcount >= model->tcount)
 644           E_FATAL("Too many trigrams\n");
 645
 646         tgptr->wid = w3;
 647         tgptr->prob3 = sorted_id (&model->sorted_prob3, &p3);
 648
 649         if ((w1 != prev_w1) || (w2 != prev_w2)) {
 650             /* Trigram for a new bigram; update tg info for all previous bigrams */
 651             if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2)))
 652               E_FATAL("Trigrams not in bigram order\n");
 653
 654             bg = (w1 != prev_w1) ? model->unigrams[w1].bigrams : prev_bg+1;
 655             endbg = model->unigrams[w1+1].bigrams;
 656             bgptr = model->bigrams + bg;
 657             for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
 658             if (bg >= endbg)
 659                 E_FATAL("Missing bigram for trigram: %s", string);
 660
 661             /* bg = bigram entry index for <w1,w2>.  Update tseg_base */
 662             seg = bg >> LOG_BG_SEG_SZ;
 663             for (i = prev_seg+1; i <= seg; i++)
 664                 model->tseg_base[i] = tgcount;
 665
 666             /* Update trigrams pointers for all bigrams until bg */
 667             if (prev_seg < seg) {
 668                 int32 tgoff = 0;
 669
 670                 if (prev_seg >= 0) {
 671                     tgoff = tgcount - model->tseg_base[prev_seg];
 672                     if (tgoff > 65535)
 673                       E_FATAL("Offset from tseg_base > 65535\n");
 674                 }
 675
 676                 prev_seg_lastbg = ((prev_seg+1) << LOG_BG_SEG_SZ) - 1;
 677                 bgptr = model->bigrams + prev_bg;
 678                 for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg; prev_bg++, bgptr++)
 679                     bgptr->trigrams = tgoff;
 680
 681                 for (; prev_bg <= bg; prev_bg++, bgptr++)
 682                     bgptr->trigrams = 0;
 683             } else {
 684                 int32 tgoff;
 685
 686                 tgoff = tgcount - model->tseg_base[prev_seg];
 687                 if (tgoff > 65535)
 688                   E_FATAL("Offset from tseg_base > 65535\n");
 689
 690                 bgptr = model->bigrams + prev_bg;
 691                 for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
 692                     bgptr->trigrams = tgoff;
 693             }
 694
 695             prev_w1 = w1;
 696             prev_w2 = w2;
 697             prev_bg = bg;
 698             prev_seg = seg;
 699         }
 700
 701         tgcount++;
 702         tgptr++;
 703
 704         if ((tgcount & 0x0000ffff) == 0) {
 705             E_INFOCONT (".");
 706         }
 707     }
 708     if (strcmp (string, "\\end\\\n") != 0)
 709         E_FATAL("Bad trigram: %s\n", string);
 710
 711     for (prev_bg++; prev_bg <= model->bcount; prev_bg++) {
 712         if ((prev_bg & (BG_SEG_SZ-1)) == 0)
 713             model->tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
 714         if ((tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535)
 715             E_FATAL("Offset from tseg_base > 65535\n");
 716         model->bigrams[prev_bg].trigrams =
 717             tgcount - model->tseg_base[prev_bg >> LOG_BG_SEG_SZ];
 718     }
 719 }
 720
 721 static FILE *lm_file_open (char const *filename, int32 usepipe)
 722 {
 723     char command[1024];
 724     FILE *fp;
 725
 726     if (usepipe) {
 727 #ifdef WIN32
 728         sprintf (command, "D:\\compress\\gzip.exe -d -c %s", filename);
 729         if ((fp = _popen (command, "r")) == NULL)
 730             E_FATAL("Cannot popen %s\n", command);
 731 #else
 732         sprintf (command, "zcat %s", filename);
 733         if ((fp = popen (command, "r")) == NULL)
 734             E_FATAL("Cannot popen %s\n", command);
 735 #endif
 736     }
 737     else {
 738         fp = CM_fopen (filename, "r");
 739     }
 740     return (fp);
 741 }
 742
 743 /*
 744  * Read in a trigram language model from the given file.  The LM tokens can be word
 745  * classes.  However a given actual word can belong to AT MOST ONE of the LM classes
 746  * used by this LM.
 747  */
 748 lm_t* lm3g_read (char const *filename)
 749 {
 750     lm_t *model;
 751     FILE *fp = NULL;
 752     int32 usingPipe = FALSE;
 753     int32 n_unigram;
 754     int32 n_bigram;
 755     int32 n_trigram;
 756     int32 dict_size;
 757     int32 i, j, k, last_bg, last_tg;
 758     struct stat statbuf;
 759     int32 idfmt;
 760
 761     E_INFO ("Reading LM file %s\n", filename);
 762
 763     /* Check if a compressed file */
 764     k = strlen(filename);
 765 #ifdef WIN32
 766     usingPipe = (k > 3) &&
 767         ((strcmp (filename+k-3, ".gz") == 0) || (strcmp (filename+k-3, ".GZ") == 0));
 768 #else
 769     usingPipe = (k > 2) &&
 770         ((strcmp (filename+k-2, ".Z") == 0) || (strcmp (filename+k-2, ".z") == 0));
 771 #endif
 772     /* Check if an .arpabo-id format file; More HACK!! Hardwired check for -id */
 773     if (usingPipe)
 774         k -= 2;
 775     idfmt = ((k > 3) && (strncmp (filename+k-3, "-id", 3) == 0));
 776
 777     fp = lm_file_open (filename, usingPipe);
 778     if (stat(filename, &statbuf) < 0)
 779       E_FATAL("stat(%s) failed\n", filename);
 780
 781     /* Read #unigrams, #bigrams, #trigrams from file */
 782     ReadNgramCounts (fp, &n_unigram, &n_bigram, &n_trigram);
 783     E_INFO ("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);
 784
 785     /* Determine dictionary size (for dict-wid -> LM-wid map) */
 786     dict_size = n_unigram;
 787
 788     if (dict_size >= 65535)
 789         E_FATAL("#dict-words(%d) > 65534\n", dict_size);
 790
 791     /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
 792     model = NewModel (n_unigram, n_bigram, n_trigram);
 793     model->word_str = (char **) CM_calloc (n_unigram, sizeof (char *));
 794     model->n_word_str = n_unigram;
 795
 796     /* Load the precompiled binary dump form of the Darpa LM file if it exists */
 797     if (1) {
 798         ReadUnigrams (fp, model);
 799         E_INFO("%8d = #unigrams created\n", model->ucount);
 800
 801         init_sorted_list (&model->sorted_prob2);
 802         if (model->tcount > 0)
 803             init_sorted_list (&model->sorted_bo_wt2);
 804
 805         ReadBigrams (fp, model, idfmt);
 806
 807         model->bcount = FIRST_BG(model,model->ucount);
 808         model->n_prob2 = model->sorted_prob2.free;
 809         model->prob2  = vals_in_sorted_list (&model->sorted_prob2);
 810         free_sorted_list (&model->sorted_prob2);
 811         E_INFO("\n%8d = #bigrams created\n", model->bcount);
 812         E_INFO("%8d = #prob2 entries\n", model->n_prob2);
 813
 814         if (model->tcount > 0) {
 815             /* Create trigram bo-wts array */
 816             model->n_bo_wt2 = model->sorted_bo_wt2.free;
 817             model->bo_wt2 = vals_in_sorted_list (&model->sorted_bo_wt2);
 818             free_sorted_list (&model->sorted_bo_wt2);
 819             E_INFO("%8d = #bo_wt2 entries\n", model->n_bo_wt2);
 820
 821             init_sorted_list (&model->sorted_prob3);
 822
 823             ReadTrigrams (fp, model, idfmt);
 824
 825             model->tcount = FIRST_TG(model,model->bcount);
 826             model->n_prob3 = model->sorted_prob3.free;
 827             model->prob3  = vals_in_sorted_list (&model->sorted_prob3);
 828             E_INFO("\n%8d = #trigrams created\n", model->tcount);
 829             E_INFO("%8d = #prob3 entries\n", model->n_prob3);
 830
 831             free_sorted_list (&model->sorted_prob3);
 832         }
 833     }
 834
 835     fclose (fp);
 836
 837     /*
 838      * Discourage expansion of end_sym and transition to start_sym.  (The given
 839      * Darpa LM may contain some spurious values that don't reflect these
 840      * requirements.)
 841      */
 842     /* bo_wt(</s>) = MIN_PROB_F */
 843     for (i = 0; (i < model->ucount) && (strcmp (model->word_str[i], end_sym) != 0); i++);
 844     E_INFO("bo_wt(%s) changed from %.4f to %.4f\n",
 845            model->word_str[i], model->unigrams[i].bo_wt1.f, MIN_PROB_F);
 846     model->unigrams[i].bo_wt1.f = MIN_PROB_F;
 847
 848     /* unigram prob(<s>) = MIN_PROB_F */
 849     for (i = 0; (i < model->ucount) && (strcmp (model->word_str[i], start_sym) != 0); i++);
 850     E_INFO("prob(%s) changed from %.4f to %.4f\n",
 851            model->word_str[i], model->unigrams[i].prob1.f, MIN_PROB_F);
 852     model->unigrams[i].prob1.f = MIN_PROB_F;
 853
 854     /* bigram prob(<s>,<s>) = MIN_PROB_F (if bigram exists) */
 855     j = FIRST_BG(model,i);
 856     last_bg = LAST_BG(model,i);
 857     for (; (j<=last_bg) && (strcmp(model->word_str[BG_WID(model,j)],start_sym)!=0); j++);
 858     if (j <= last_bg) {
 859         E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
 860                model->word_str[i], model->word_str[BG_WID(model,j)],
 861                model->prob2[model->bigrams[j].prob2].f,
 862                model->prob2[0].f);
 863         model->bigrams[j].prob2 = 0;
 864
 865         if (model->tcount > 0) {
 866             /* trigram prob(<s>,<s>,<s>) = MIN_PROB_F (if trigram exists) */
 867             k = FIRST_TG(model,j);
 868             last_tg = LAST_TG(model,j);
 869             for (; k <= last_tg; k++) {
 870                 if (strcmp (model->word_str[TG_WID(model,k)], start_sym) == 0)
 871                     break;
 872             }
 873             if (k <= last_tg) {
 874                 E_INFO("prob(%s,%s,%s) changed from %.4f to %.4f\n",
 875                        model->word_str[i], model->word_str[BG_WID(model,j)], model->word_str[TG_WID(model,k)],
 876                        model->prob3[model->trigrams[k].prob3].f,
 877                        model->prob3[0].f);
 878                 model->trigrams[k].prob3 = 0;
 879             }
 880         }
 881     }
 882
 883     /* bigram prob(<s>,</s>) = MIN_PROB_F (if bigram exists) */
 884     j = FIRST_BG(model,i);
 885     last_bg = LAST_BG(model,i);
 886     for (; (j<=last_bg) && (strcmp(model->word_str[BG_WID(model,j)],end_sym)!=0); j++);
 887     if (j <= last_bg) {
 888         E_INFO("prob(%s,%s) changed from %.4f to %.4f\n",
 889                model->word_str[i], model->word_str[BG_WID(model,j)],
 890                model->prob2[model->bigrams[j].prob2].f,
 891                model->prob2[0].f);
 892         model->bigrams[j].prob2 = 0;
 893     }
 894
 895     return model;
 896 }
 897
 898 void lm3g_free (lm_t *model)
 899 {
 900     int32 i,u;
 901     tginfo_t *tginfo, *next_tginfo;
 902
 903     hash_free (&(model->HT));
 904     for (i = 0; i < model->ucount; i++)
 905         free (model->word_str[i]);
 906     free (model->word_str);
 907
 908     free (model->unigrams);
 909     free (model->bigrams);
 910     free (model->prob2);
 911     if (model->tcount > 0) {
 912         free (model->trigrams);
 913         free (model->tseg_base);
 914         free (model->bo_wt2);
 915         free (model->prob3);
 916     }
 917     if (model->HT.tab != NULL)
 918         hash_free (&model->HT);
 919
 920     if (model->tginfo) {
 921         for (u = 0; u < model->max_ucount; u++)
 922             for (tginfo = model->tginfo[u]; tginfo; tginfo = next_tginfo) {
 923                 next_tginfo = tginfo->next;
 924                 listelem_free ((void *)tginfo, sizeof(tginfo_t));
 925             }
 926         free (model->tginfo);
 927     }
 928
 929     free (model);
 930 }
 931
 932 void lmSetStartSym (char const *sym)
 933 /*----------------------------*
 934  * Description - reconfigure the start symbol
 935  */
 936 {
 937     start_sym = (char *) salloc(sym);
 938 }
 939
 940 void lmSetEndSym (char const *sym)
 941 /*----------------------------*
 942  * Description - reconfigure the end symbol
 943  */
 944 {
 945     end_sym = (char *) salloc(sym);
 946 }
 947
 948 #ifdef NO_DICT
 949 main (argc, argv)
 950     int32 argc;
 951     char *argv[];
 952 {
 953     lm_t *model;
 954     int32 i, n, score;
 955     float lw, uw, wip;
 956     char wd[3][100], line[1000];
 957     int32 wid[3];
 958     int32 unkwid;
 959
 960     if (argc < 4)
 961         E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
 962     if (sscanf(argv[1], "%f", &lw) != 1)
 963         E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
 964     if (sscanf(argv[2], "%f", &uw) != 1)
 965         E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
 966     if (sscanf(argv[3], "%f", &wip) != 1)
 967         E_FATAL("Usage: %s <lw> <uw> <wip> [LM-file]\n", argv[0]);
 968     if (argc == 5)
 969         lm_read(argv[4], lw, uw, wip, 1.0);
 970     else
 971         lm_read("/net/alf8/cdrom/wsj1/grammar/tgboc20o.nvp", lw, uw, wip, 1.0);
 972
 973     lmSetParameters (model, 0);
 974
 975     unkwid = lm3g_wstr2wid (model, "<UNK>");
 976     unkwid --; // hash value is 1-base
 977     for (;;) {
 978         printf ("Enter 1, 2, or 3 words: ");
 979         fgets (line, sizeof(line), stdin);
 980         if (((n = sscanf (line, "%s %s %s", wd[0], wd[1], wd[2])) < 1) || (n > 3))
 981             break;
 982         for (i = 0; i < n; i++)
 983             if ((wid[i] = lm3g_wstr2wid (model, wd[i])) == NO_WORD) {
 984                 printf ("  %s -> <UNK>\n", wd[i]);
 985                 wid[i] = unkwid;
 986             }
 987             else
 988                 wid[i] --;
 989
 990         switch (n) {
 991         case 1:
 992             score = unigram_score (wid[0]);
 993             printf ("unigram_score(%d) = %d\n",
 994                     wid[0], score);
 995             break;
 996         case 2:
 997             score = bigram_score (wid[0], wid[1]);
 998             printf ("bigram_score(%d, %d) = %d\n",
 999                     wid[0], wid[1], score);
1000             break;
1001         case 3:
1002             score = trigram_score (wid[0], wid[1], wid[2]);
1003             printf ("trigram_score(%d, %d, %d) = %d\n",
1004                     wid[0], wid[1], wid[2], score);
1005             break;
1006         default:
1007             break;
1008         }
1009     }
1010 }
1011 #endif
1012
1013 #define BINARY_SEARCH_THRESH    16
1014
1015 float lm3g_ug_score (lm_t *lm, int32 wid)
1016 {
1017     if (wid >= lm->ucount)
1018         E_FATAL("dictwid[%d] not in LM\n", wid);
1019
1020     return (lm->unigrams[wid].prob1.f);
1021 }
1022
1023 /* Locate a specific bigram within a bigram list */
1024 static int32 find_bg (bigram_t *bg, int32 n, int32 w)
1025 {
1026     int32 i, b, e;
1027
1028     /* Binary search until segment size < threshold */
1029     b = 0;
1030     e = n;
1031     while (e-b > BINARY_SEARCH_THRESH) {
1032         i = (b+e)>>1;
1033         if (bg[i].wid < w)
1034             b = i+1;
1035         else if (bg[i].wid > w)
1036             e = i;
1037         else
1038             return i;
1039     }
1040
1041     /* Linear search within narrowed segment */
1042     for (i = b; (i < e) && (bg[i].wid != w); i++);
1043     return ((i < e) ? i : -1);
1044 }
1045
1046 /* w1, w2 are dictionary (base-)word ids */
1047 float lm3g_bg_score (lm_t *lm, int32 w1, int32 w2)
1048 {
1049     int32 i, n, b;
1050     float score;
1051     bigram_t *bg;
1052
1053     /* lm->n_bg_score++; */
1054
1055     if (w1 >= lm->ucount)
1056         E_FATAL("dictwid[%d] not in LM\n", w1);
1057     if (w2 >= lm->ucount)
1058         E_FATAL("dictwid[%d] not in LM\n", w2);
1059
1060     b = FIRST_BG(lm, w1);
1061     n = FIRST_BG(lm, w1+1) - b;
1062     bg = lm->bigrams + b;
1063
1064     if ((i = find_bg (bg, n, w2)) >= 0) {
1065         score = lm->prob2[bg[i].prob2].f;
1066     } else {
1067         /* lm->n_bg_bo++; */
1068         score = lm->unigrams[w1].bo_wt1.f + lm->unigrams[w2].prob1.f;
1069     }
1070
1071     return (score);
1072 }
1073
1074 static void load_tginfo (lm_t *lm, int32 lw1, int32 lw2)
1075 {
1076     int32 i, n, b, t;
1077     bigram_t *bg;
1078     tginfo_t *tginfo;
1079
1080     /* First allocate space for tg information for bg lw1,lw2 */
1081     tginfo = (tginfo_t *) listelem_alloc (sizeof(tginfo_t));
1082     tginfo->w1 = lw1;
1083     tginfo->tg = NULL;
1084     tginfo->next = lm->tginfo[lw2];
1085     lm->tginfo[lw2] = tginfo;
1086
1087     /* Locate bigram lw1,lw2 */
1088
1089     b = lm->unigrams[lw1].bigrams;
1090     n = lm->unigrams[lw1+1].bigrams - b;
1091     bg = lm->bigrams + b;
1092
1093     if ((n > 0) && ((i = find_bg (bg, n, lw2)) >= 0)) {
1094         tginfo->bowt = lm->bo_wt2[bg[i].bo_wt2].f;
1095
1096         /* Find t = Absolute first trigram index for bigram lw1,lw2 */
1097         b += i;                 /* b = Absolute index of bigram lw1,lw2 on disk */
1098         t = FIRST_TG(lm, b);
1099
1100         tginfo->tg = lm->trigrams + t;
1101
1102         /* Find #tg for bigram w1,w2 */
1103         tginfo->n_tg = FIRST_TG(lm, b+1) - t;
1104     } else {                    /* No bigram w1,w2 */
1105         tginfo->bowt = 0;
1106         tginfo->n_tg = 0;
1107     }
1108 }
1109
1110 /* Similar to find_bg */
1111 static int32 find_tg (trigram_t *tg, int32 n, int32 w)
1112 {
1113     int32 i, b, e;
1114
1115     b = 0;
1116     e = n;
1117     while (e-b > BINARY_SEARCH_THRESH) {
1118         i = (b+e)>>1;
1119         if (tg[i].wid < w)
1120             b = i+1;
1121         else if (tg[i].wid > w)
1122             e = i;
1123         else
1124             return i;
1125     }
1126
1127     for (i = b; (i < e) && (tg[i].wid != w); i++);
1128     return ((i < e) ? i : -1);
1129 }
1130
1131 /* w1, w2, w3 are dictionary wids */
1132 float lm3g_tg_score (lm_t *lm, int32 w1, int32 w2, int32 w3)
1133 {
1134     int32 i, n;
1135     float score;
1136     trigram_t *tg;
1137     tginfo_t *tginfo, *prev_tginfo;
1138
1139     if ((lm->tcount <= 0) || (w1 < 0))
1140         return (lm3g_bg_score (lm, w2, w3));
1141
1142     /* lm->n_tg_score++; */
1143
1144     if (w1 >= lm->ucount)
1145         E_FATAL("dictwid[%d] not in LM\n", w1);
1146     if (w2 >= lm->ucount)
1147         E_FATAL("dictwid[%d] not in LM\n", w2);
1148     if (w3 >= lm->ucount)
1149         E_FATAL("dictwid[%d] not in LM\n", w3);
1150
1151     prev_tginfo = NULL;
1152     for (tginfo = lm->tginfo[w2]; tginfo; tginfo = tginfo->next) {
1153         if (tginfo->w1 == w1)
1154             break;
1155         prev_tginfo = tginfo;
1156     }
1157
1158     if (! tginfo) {
1159         load_tginfo (lm, w1, w2);
1160         tginfo = lm->tginfo[w2];
1161     } else if (prev_tginfo) {
1162         prev_tginfo->next = tginfo->next;
1163         tginfo->next = lm->tginfo[w2];
1164         lm->tginfo[w2] = tginfo;
1165     }
1166
1167     tginfo->used = 1;
1168
1169     /* Trigrams for w1,w2 now pointed to by tginfo */
1170     n = tginfo->n_tg;
1171     tg = tginfo->tg;
1172     if ((i = find_tg (tg, n, w3)) >= 0) {
1173         score = lm->prob3[tg[i].prob3].f;
1174     } else {
1175         /* lm->n_tg_bo++; */
1176         score = tginfo->bowt + lm3g_bg_score(lm, w2, w3);
1177     }
1178
1179     return (score);
1180 }
1181
1182 int lm3g_word_str_size(lm_t *lm)
1183 {
1184     return lm->n_word_str;
1185 }