extensions/spellcheck/hunspell/src/csutil.hxx

   1 /******* BEGIN LICENSE BLOCK *******
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * The contents of this file are subject to the Mozilla Public License Version
   5  * 1.1 (the "License"); you may not use this file except in compliance with
   6  * the License. You may obtain a copy of the License at
   7  * http://www.mozilla.org/MPL/
   8  *
   9  * Software distributed under the License is distributed on an "AS IS" basis,
  10  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  11  * for the specific language governing rights and limitations under the
  12  * License.
  13  *
  14  * The Initial Developers of the Original Code are Kevin Hendricks (MySpell)
  15  * and László Németh (Hunspell). Portions created by the Initial Developers
  16  * are Copyright (C) 2002-2005 the Initial Developers. All Rights Reserved.
  17  *
  18  * Contributor(s): Kevin Hendricks (kevin.hendricks@sympatico.ca)
  19  *                 David Einstein (deinst@world.std.com)
  20  *                 László Németh (nemethl@gyorsposta.hu)
  21  *                 Davide Prina
  22  *                 Giuseppe Modugno
  23  *                 Gianluca Turconi
  24  *                 Simon Brouwer
  25  *                 Noll Janos
  26  *                 Biro Arpad
  27  *                 Goldman Eleonora
  28  *                 Sarlos Tamas
  29  *                 Bencsath Boldizsar
  30  *                 Halacsy Peter
  31  *                 Dvornik Laszlo
  32  *                 Gefferth Andras
  33  *                 Nagy Viktor
  34  *                 Varga Daniel
  35  *                 Chris Halls
  36  *                 Rene Engelhard
  37  *                 Bram Moolenaar
  38  *                 Dafydd Jones
  39  *                 Harri Pitkanen
  40  *                 Andras Timar
  41  *                 Tor Lillqvist
  42  *
  43  * Alternatively, the contents of this file may be used under the terms of
  44  * either the GNU General Public License Version 2 or later (the "GPL"), or
  45  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  46  * in which case the provisions of the GPL or the LGPL are applicable instead
  47  * of those above. If you wish to allow use of your version of this file only
  48  * under the terms of either the GPL or the LGPL, and not to allow others to
  49  * use your version of this file under the terms of the MPL, indicate your
  50  * decision by deleting the provisions above and replace them with the notice
  51  * and other provisions required by the GPL or the LGPL. If you do not delete
  52  * the provisions above, a recipient may use your version of this file under
  53  * the terms of any one of the MPL, the GPL or the LGPL.
  54  *
  55  ******* END LICENSE BLOCK *******/
  56
  57 #ifndef __CSUTILHXX__
  58 #define __CSUTILHXX__
  59
  60 // First some base level utility routines
  61
  62 #include "w_char.hxx"
  63
// casing
// Integer codes for the capitalization pattern of a word.
// NOTE(review): presumably these are the values returned by get_captype() and
// get_captype_utf8() declared later in this header; judging by the names only:
// no capitals, initial capital, all capitals, mixed case ("huh"), and mixed
// case with an initial capital -- confirm against the implementation.
#define NOCAP   0
#define INITCAP 1
#define ALLCAP  2
#define HUHCAP  3
#define HUHINITCAP  4
  70
// default encoding and keystring
// SPELL_ENCODING names the fallback character encoding.
// SPELL_KEYSTRING lists the three letter rows of a QWERTY keyboard, separated
// by '|'.  NOTE(review): presumably used to find neighbouring keys when
// generating suggestions -- confirm at the point of use.
#define SPELL_ENCODING  "ISO8859-1"
#define SPELL_KEYSTRING "qwertyuiop|asdfghjkl|zxcvbnm"
  74
  75 // default morphological fields
  76 #define MORPH_STEM        "st:"
  77 #define MORPH_ALLOMORPH   "al:"
  78 #define MORPH_POS         "po:"
  79 #define MORPH_DERI_PFX    "dp:"
  80 #define MORPH_INFL_PFX    "ip:"
  81 #define MORPH_TERM_PFX    "tp:"
  82 #define MORPH_DERI_SFX    "ds:"
  83 #define MORPH_INFL_SFX    "is:"
  84 #define MORPH_TERM_SFX    "ts:"
  85 #define MORPH_SURF_PFX    "sp:"
  86 #define MORPH_FREQ        "fr:"
  87 #define MORPH_PHON        "ph:"
  88 #define MORPH_HYPH        "hy:"
  89 #define MORPH_PART        "pa:"
  90 #define MORPH_FLAG        "fl:"
  91 #define MORPH_HENTRY      "_H:"
  92 #define MORPH_TAG_LEN     strlen(MORPH_STEM)
  93
// Separator characters: space, newline and vertical tab.
// NOTE(review): judging by the names, these separate fields, records and
// alternatives in morphological output -- confirm at the point of use.
#define MSEP_FLD ' '
#define MSEP_REC '\n'
#define MSEP_ALT '\v'
  97
// default flags
// Note that DEFAULTFLAGS and FORBIDDENWORD are defined to the same value
// (65510); ONLYUPCASEFLAG is the next value (65511).
// NOTE(review): presumably DEFAULTFLAGS marks the start of a reserved range of
// flag values, with FORBIDDENWORD being its first member -- confirm.
#define DEFAULTFLAGS   65510
#define FORBIDDENWORD  65510
#define ONLYUPCASEFLAG 65511
 102
 103 // hash entry macros
 104 #define HENTRY_DATA(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \
 105     get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : NULL)
 106 // NULL-free version for warning-free OOo build
 107 #define HENTRY_DATA2(h) (h->var ? ((h->var & H_OPT_ALIASM) ? \
 108     get_stored_pointer(&(h->word) + h->blen + 1) : &(h->word) + h->blen + 1) : "")
 109 #define HENTRY_FIND(h,p) (HENTRY_DATA(h) ? strstr(HENTRY_DATA(h), p) : NULL)
 110
// True when two w_char values are equal: compares both the .l and the .h
// member.  Arguments may be evaluated more than once, so avoid passing
// expressions with side effects.
#define w_char_eq(a,b) (((a).l == (b).l) && ((a).h == (b).h))
 112
// convert UTF-16 characters to UTF-8
// NOTE(review): size is presumably the capacity of dest and srclen the number
// of w_char units in src; the returned pointer is presumably dest -- confirm
// against the implementation.
char * u16_u8(char * dest, int size, const w_char * src, int srclen);

// convert UTF-8 characters to UTF-16
// NOTE(review): size is presumably the capacity of dest in w_char units and
// the return value the number of units produced -- confirm (including what is
// returned on malformed input).
int u8_u16(w_char * dest, int size, const char * src);

// sort 2-byte vector
// NOTE(review): begin and end look like index bounds of the range to sort;
// whether end is inclusive cannot be told from this declaration -- confirm.
void flag_qsort(unsigned short flags[], int begin, int end);

// binary search in 2-byte vector
// NOTE(review): presumably requires flags to be sorted (e.g. by flag_qsort)
// and returns non-zero when flag is present; whether right is a length or the
// last index is not visible here -- confirm.
int flag_bsearch(unsigned short flags[], unsigned short flag, int right);
 124
// remove end of line char(s)
// Works on the caller's buffer (s is non-const and nothing is returned).
void   mychomp(char * s);

// duplicate string
// NOTE(review): the copy is presumably heap-allocated and owned by the caller
// -- confirm how it must be released and what happens for a NULL argument.
char * mystrdup(const char * s);

// strcat for limited length destination string
// NOTE(review): max presumably bounds the resulting length of dest; whether it
// counts the terminating NUL is not visible here -- confirm.
char * mystrcat(char * dest, const char * st, int max);

// duplicate reverse of string
// NOTE(review): ownership of the returned string presumably as for mystrdup
// -- confirm.
char * myrevstrdup(const char * s);

// parse into tokens with char delimiter
// sptr is an in/out cursor into the string being parsed.
// NOTE(review): presumably advanced past each returned token in the manner of
// strsep() -- confirm.
char * mystrsep(char ** sptr, const char delim);
// parse into tokens with char delimiter
// NOTE(review): same signature as mystrsep; how the two variants differ is not
// visible from this header.
char * mystrsep2(char ** sptr, const char delim);

// NOTE(review): the comment that used to stand here was a copy of the
// mystrsep one ("parse into tokens with char delimiter").  Judging by the
// name, this presumably replaces occurrences of a pattern in a string with a
// replacement -- confirm the role of each unnamed parameter in csutil.cxx.
char * mystrrep(char *, const char *, const char *);

// append s to the end of every line in lines
// NOTE(review): lines must presumably have room for the added text -- confirm.
void strlinecat(char * lines, const char * s);
 147
// tokenize text into lines, using breakchar as the line separator
// NOTE(review): *lines presumably receives a newly allocated array of strings
// (to be released with freelist) and the return value is the number of lines
// -- confirm.
   int line_tok(const char * text, char *** lines, char breakchar);

// tokenize into lines (separated by breakchar) and remove duplicates in place
// NOTE(review): line_uniq_app takes char **, so it can presumably replace
// *text with a different buffer -- confirm how it differs from line_uniq.
   char * line_uniq(char * text, char breakchar);
   char * line_uniq_app(char ** text, char breakchar);

// change every oldc character to newc in place
   char * tr(char * text, char oldc, char newc);

// reverse word
// NOTE(review): meaning of the int return value is not visible here.
   int reverseword(char *);

// reverse word
// NOTE(review): judging by the name, the UTF-8 aware counterpart of
// reverseword -- confirm.
   int reverseword_utf(char *);

// remove duplicates
// NOTE(review): n is presumably the number of entries in list and the return
// value the number remaining -- confirm, including who frees removed entries.
 int uniqlist(char ** list, int n);

// free character array list
// NOTE(review): takes char ***, so *list is presumably reset after freeing its
// n entries -- confirm.
   void freelist(char *** list, int n);
 169
 170 // character encoding information
 171 struct cs_info {
 172   unsigned char ccase;
 173   unsigned char clower;
 174   unsigned char cupper;
 175 };
 176
 177 // Unicode character encoding information
 178 struct unicode_info {
 179   unsigned short c;
 180   unsigned short cupper;
 181   unsigned short clower;
 182 };
 183
 184 struct unicode_info2 {
 185   char cletter;
 186   unsigned short cupper;
 187   unsigned short clower;
 188 };
 189
// Unicode case-mapping helpers operating on single 16-bit characters.
// NOTE(review): initialize_utf_tbl()/free_utf_tbl() presumably build and
// release a table that the three query functions below rely on; the meaning
// of the int result of initialize_utf_tbl is not visible here -- confirm.
// NOTE(review): langnum is presumably a language identifier as returned by
// get_lang_num(), used for language-specific casing rules -- confirm.
int initialize_utf_tbl();
void free_utf_tbl();
unsigned short unicodetoupper(unsigned short c, int langnum);
unsigned short unicodetolower(unsigned short c, int langnum);
int unicodeisalpha(unsigned short c);
 195
 196 struct enc_entry {
 197   const char * enc_name;
 198   struct cs_info * cs_table;
 199 };
 200
 201 // language to encoding default map
 202
 203 struct lang_map {
 204   const char * lang;
 205   const char * def_enc;
 206   int num;
 207 };
 208
// look up the cs_info table for the encoding named by es
// NOTE(review): behaviour for an unknown encoding name, and ownership of the
// returned table, are not visible here -- confirm.
struct cs_info * get_current_cs(const char * es);

// get the name of the default encoding for a language
// NOTE(review): the result for an unknown language is not visible here
// (possibly SPELL_ENCODING) -- confirm.
const char * get_default_enc(const char * lang);

// get language identifiers of language codes
int get_lang_num(const char * lang);

// get characters of the given 8bit encoding with lower- and uppercase forms
// NOTE(review): the returned string is non-const and presumably allocated for
// the caller -- confirm who frees it.
char * get_casechars(const char * enc);
 218
// convert null terminated string to all caps using encoding
// (reads from p, writes to d; encoding is the encoding name)
void enmkallcap(char * d, const char * p, const char * encoding);

// convert null terminated string to all little using encoding
void enmkallsmall(char * d, const char * p, const char * encoding);

// convert null terminated string to have initial capital using encoding
void enmkinitcap(char * d, const char * p, const char * encoding);

// convert null terminated string to all caps
// (p is non-const, so the conversion is applied to the caller's buffer;
// csconv is the case table to use)
void mkallcap(char * p, const struct cs_info * csconv);

// convert null terminated string to all little
void mkallsmall(char * p, const struct cs_info * csconv);

// convert null terminated string to have initial capital
void mkinitcap(char * p, const struct cs_info * csconv);

// convert first nc characters of a w_char string to little
// NOTE(review): the original comment said "UTF-8 string", but the parameter
// is w_char *, which this header elsewhere treats as UTF-16 (see u16_u8) --
// confirm.
void mkallsmall_utf(w_char * u, int nc, int langnum);

// convert first nc characters of a w_char string to capital
// NOTE(review): same remark as for mkallsmall_utf about "UTF-8" vs. w_char.
void mkallcap_utf(w_char * u, int nc, int langnum);

// get type of capitalization
// NOTE(review): presumably returns one of NOCAP, INITCAP, ALLCAP, HUHCAP,
// HUHINITCAP, with nl being the length of q -- confirm.
int get_captype(char * q, int nl, cs_info *);

// get type of capitalization (UTF-8)
// NOTE(review): despite the name, q is a w_char array here; nl is presumably
// its length in w_char units -- confirm.
int get_captype_utf8(w_char * q, int nl, int langnum);
 248
// strip all ignored characters in the string
// (variant taking the ignored characters as an array of 16-bit values of
// length ignored_len; word is modified through the non-const pointer)
void remove_ignored_chars_utf(char * word, unsigned short ignored_chars[], int ignored_len);

// strip all ignored characters in the string
// (variant taking the ignored characters as a char string)
void remove_ignored_chars(char * word, char * ignored_chars);

// NOTE(review): presumably parses a string value from a configuration line
// into *out, with ln being the line number for diagnostics; meaning of the
// return value is not visible here -- confirm.
int parse_string(char * line, char ** out, int ln);

// NOTE(review): like parse_string, but additionally fills *out_utf16 and
// *out_utf16_len; utf8 presumably selects whether the UTF-16 output is
// produced -- confirm.
int parse_array(char * line, char ** out, unsigned short ** out_utf16,
    int * out_utf16_len, int utf8, int ln);

// NOTE(review): presumably the length of the field starting at r, up to the
// next separator -- confirm which characters terminate a field.
int fieldlen(const char * r);
// NOTE(review): presumably copies the value of the field tagged var (e.g. one
// of the MORPH_* tags) from morph into dest -- confirm return value and the
// required size of dest.
char * copy_field(char * dest, const char * morph, const char * var);

// NOTE(review): comparison of two morphological descriptions; the ordering
// semantics of the result are not visible here -- confirm.
int morphcmp(const char * s, const char * t);

// NOTE(review): judging by the name, counts suffix fields in morph -- confirm.
int get_sfxcount(const char * morph);

// conversion function for protected memory
// NOTE(review): presumably writes the pointer value source into the bytes at
// dest, to be read back with get_stored_pointer -- confirm.
void store_pointer(char * dest, char * source);

// conversion function for protected memory
// NOTE(review): presumably reads back a pointer previously written with
// store_pointer at s -- confirm.
char * get_stored_pointer(char * s);
 272
 273 #endif