softcount: tolerate zero ngrams
[vspell.git] / libvspell / lm / lm_3g.h
blob2b3251cc90340c59f93fc06d6662daa7953ebd2a
1 /* ====================================================================
2 * Copyright (c) 1999-2001 Carnegie Mellon University. All rights
3 * reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
18 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
19 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
20 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
21 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
22 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 * ====================================================================
35 * lm_3g.h - darpa standard trigram language model header file
37 * HISTORY
39 * $Log: lm_3g.h,v $
40 * Revision 1.8 2004/12/10 16:48:55 rkm
41 * Added continuous density acoustic model handling
44 * 28-Oct-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
45 * Added lm3g_access_type().
47 * 14-Apr-98 M K Ravishankar (rkm@cs) at Carnegie Mellon University
48 * Added lm3g_n_lm() and lm3g_index2name().
50 * 02-Apr-97 M K Ravishankar (rkm@cs) at Carnegie Mellon University
51 * Added lm3g_raw_score() and lm_t.invlw.
52 * Changed lm_{u,b,t}g_score to lm3g_{u,b,t}g_score.
54 * 01-Jul-95 M K Ravishankar (rkm@cs) at Carnegie Mellon University
55 * Added tginfo_t to help speed up find_bg() and find_tg() searches.
57 * Revision 1.1 2000/12/05 01:33:29 lenzo
58 * files added or moved
60 * Revision 1.1.1.1 2000/01/28 22:08:58 lenzo
61 * Initial import of sphinx2
64 * Revision 6.5 93/10/27 17:47:04 rkm
65 * *** empty log message ***
67 * Revision 6.4 93/10/15 15:02:39 rkm
68 * *** empty log message ***
72 #ifndef _LM_3G_H_
73 #define _LM_3G_H_
75 #include "s2types.h"
76 #include "hash.h"
78 /* log quantities represented in either floating or integer format */
79 typedef union {
80 float f;
81 int32 l;
82 } log_t;
85 typedef struct unigram_s {
86 int32 mapid; /* Mapping to dictionary, LM-class, or other id if any.
87 Without any such mapping, this id is just the index of the
88 unigram entry. If mapped, the ID is interpreted depending
89 on the range it falls into (see lm_3g.c); and it is -ve if
90 there is no applicable mapping. */
91 log_t prob1;
92 log_t bo_wt1;
93 int32 bigrams; /* index of 1st entry in lm_t.bigrams[] */
94 } unigram_t, UniGram, *UniGramPtr;
97 * To conserve space, bigram info is kept in many tables. Since the number
98 * of distinct values << #bigrams, these table indices can be 16-bit values.
99 * prob2 and bo_wt2 are such indices, but keeping trigram index is less easy.
100 * It is supposed to be the index of the first trigram entry for each bigram.
101 * But such an index cannot be represented in 16-bits, hence the following
102 * segmentation scheme: Partition bigrams into segments of BG_SEG_SZ
103 * consecutive entries, such that #trigrams in each segment <= 2**16 (the
104 * corresponding trigram segment). The bigram_t.trigrams value is then a
105 * 16-bit relative index within the trigram segment. A separate table--
106 * lm_t.tseg_base--has the index of the 1st trigram for each bigram segment.
108 #define BG_SEG_SZ 512 /* chosen so that #trigram/segment <= 2**16 */
109 #define LOG_BG_SEG_SZ 9
111 typedef struct bigram_s {
112 uint16 wid; /* Index of unigram entry for this. (NOT dictionary id.) */
113 uint16 prob2; /* index into array of actual bigram probs */
114 uint16 bo_wt2; /* index into array of actual bigram backoff wts */
115 uint16 trigrams; /* index of 1st entry in lm_t.trigrams[],
116 RELATIVE TO its segment base (see above) */
117 } bigram_t, BiGram, *BiGramPtr;
120 * As with bigrams, trigram prob info kept in a separate table for conserving
121 * memory space.
123 typedef struct trigram_s {
124 uint16 wid; /* Index of unigram entry for this. (NOT dictionary id.) */
125 uint16 prob3; /* index into array of actual trigram probs */
126 } trigram_t;
130 * The following trigram information cache eliminates most traversals of 1g->2g->3g
131 * tree to locate trigrams for a given bigram (lw1,lw2). The organization is optimized
132 * for locality of access (to the same lw1), given lw2.
134 typedef struct tginfo_s {
135 int32 w1; /* lw1 component of bigram lw1,lw2. All bigrams with
136 same lw2 linked together (see lm_t.tginfo). */
137 int32 n_tg; /* #tg for parent bigram lw1,lw2 */
138 int32 bowt; /* tg bowt for lw1,lw2 */
139 int32 used; /* whether used since last lm_reset */
140 trigram_t *tg; /* Trigrams for lw1,lw2 */
141 struct tginfo_s *next; /* Next lw1 with same parent lw2; NULL if none. */
142 } tginfo_t;
145 * Bigram probs and bo-wts, and trigram probs are kept in separate tables
146 * rather than within the bigram_t and trigram_t structures. These tables
147 * hold unique prob and bo-wt values, and can be < 64K long (see lm_3g.h).
148 * The following tree structure is used to construct these tables of unique
149 * values. Whenever a new value is read from the LM file, the sorted tree
150 * structure is searched to see if the value already exists, and inserted
151 * if not found.
153 typedef struct sorted_entry_s {
154 log_t val; /* value being kept in this node */
155 uint16 lower; /* index of another entry. All descendants down
156 this path have their val < this node's val.
157 0 => no son exists (0 is root index) */
158 uint16 higher; /* index of another entry. All descendants down
159 this path have their val > this node's val
160 0 => no son exists (0 is root index) */
161 } sorted_entry_t;
164 * The sorted list. list is a (64K long) array. The first entry is the
165 * root of the tree and is created during initialization.
167 typedef struct {
168 sorted_entry_t *list;
169 int32 free; /* first free element in list */
170 } sorted_list_t;
173 * The language model.
174 * Bigrams for each unigram are contiguous. Bigrams for unigram i+1 come
175 * immediately after bigrams for unigram i. So, no need for a separate count
176 * of bigrams/unigram; it is enough to know the 1st bigram for each unigram.
177 * But an extra dummy unigram entry needed at the end to terminate the last
178 * real entry.
179 * Similarly, trigrams for each bigram are contiguous and trigrams for bigram
180 * i+1 come immediately after trigrams for bigram i, and an extra dummy bigram
181 * entry is required at the end.
183 typedef struct lm_s {
184 unigram_t *unigrams;
185 bigram_t *bigrams; /* for entire LM */
186 trigram_t *trigrams;/* for entire LM */
187 log_t *prob2; /* table of actual bigram probs */
188 int32 n_prob2; /* prob2 size */
189 log_t *bo_wt2; /* table of actual bigram backoff weights */
190 int32 n_bo_wt2; /* bo_wt2 size */
191 log_t *prob3; /* table of actual trigram probs */
192 int32 n_prob3; /* prob3 size */
193 int32 *tseg_base; /* tseg_base[i>>LOG_BG_SEG_SZ] = index of 1st
194 trigram for bigram segment (i>>LOG_BG_SEG_SZ) */
195 int32 max_ucount; /* To which ucount can grow with dynamic addition of words */
196 int32 ucount; /* #unigrams in LM */
197 int32 bcount; /* #bigrams in entire LM */
198 int32 tcount; /* #trigrams in entire LM */
200 double lw; /* language weight */
201 double invlw; /* 1.0/language weight */
202 double uw; /* unigram weight */
203 int32 log_wip; /* word insertion penalty */
205 tginfo_t **tginfo; /* tginfo[lw2] is head of linked list of trigram information for
206 some cached subset of bigrams (*,lw2). */
208 hash_t HT; /* hash table for word-string->word-id map */
210 /* Words in LM; used only for building internal LM from LM file */
211 char **word_str;
212 int n_word_str;
214 /* Arrays of unique bigram probs and bo-wts, and trigram probs */
215 sorted_list_t sorted_prob2;
216 sorted_list_t sorted_bo_wt2;
217 sorted_list_t sorted_prob3;
218 } lm_t;
221 #define LM3G_ACCESS_ERR 0
222 #define LM3G_ACCESS_UG 1
223 #define LM3G_ACCESS_BG 2
224 #define LM3G_ACCESS_TG 3
227 /* ----Interface---- */
229 void lmSetStartSym (char const *sym);
230 void lmSetEndSym (char const *sym);
231 void lm3g_free (lm_t *model);
232 lm_t * NewModel (int32 n_ug, int32 n_bg, int32 n_tg);
233 lm_t * lm3g_read(char const *filename);
235 float lm3g_tg_score (lm_t *lm, int32 w1, int32 w2, int32 w3);
236 float lm3g_bg_score (lm_t *lm, int32 w1, int32 w2);
237 float lm3g_ug_score (lm_t *lm, int32 w);
238 float lm3g_raw_score (lm_t *lm, int32 score);
240 int lm3g_word_str_size(lm_t *lm);
241 int32 lm3g_wstr2wid (lm_t *model, const char *w);
242 #endif /* _LM_3G_H_ */