Bug 468575 - Scrape some gunk off the config/ grout, r=ted
[wine-gecko.git] / intl / lwbrk / src / rulebrk.c
blobc98d52188f70294729443d5f47de3b1b724eef25
1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
12 * License.
14 * The Original Code is LibInThai.
16 * The Initial Developer of the Original Code is
17 * Samphan Raruenrom.
18 * Portions created by the Initial Developer are Copyright (C) 1998
19 * the Initial Developer. All Rights Reserved.
21 * Contributor(s):
23 * Alternatively, the contents of this file may be used under the terms of
24 * either of the GNU General Public License Version 2 or later (the "GPL"),
25 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 * in which case the provisions of the GPL or the LGPL are applicable instead
27 * of those above. If you wish to allow use of your version of this file only
28 * under the terms of either the GPL or the LGPL, and not to allow others to
29 * use your version of this file under the terms of the MPL, indicate your
30 * decision by deleting the provisions above and replace them with the notice
31 * and other provisions required by the GPL or the LGPL. If you do not delete
32 * the provisions above, a recipient may use your version of this file under
33 * the terms of any one of the MPL, the GPL or the LGPL.
35 * ***** END LICENSE BLOCK ***** */
36 #define TH_UNICODE
38 #include <stdlib.h>
39 #include <assert.h>
40 #include "th_char.h"
/*
 * ASCII-only character classification used while scanning non-Thai text.
 * Both are macros and evaluate their argument more than once, so do not
 * pass expressions with side effects.
 *
 * th_isalpha(c): non-zero when c is 'a'..'z' or 'A'..'Z'.
 * th_isspace(c): non-zero when c is a space or a horizontal tab only
 *                (newlines and other whitespace are NOT included).
 */
#define th_isalpha(c)	(((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
#define th_isspace(c)	((c)==' '||(c)=='\t')
/*
 * Thai character type array.
 *
 * Every character handled by the breaker is described by a bit mask of the
 * classes below.  The masks are stored in _TwbType (defined later in this
 * file) and looked up through twbtype().
 */
typedef unsigned short twb_t;
extern const twb_t _TwbType[0x100-0xa0];

/*
 * Bit definitions.
 *
 * The expansions of the abbreviations are not spelled out anywhere in this
 * file; the notes below are derived from which table slots carry each bit
 * and from how TrbWordBreakPos() tests them.  Treat the wording as
 * NOTE(review): presumed meaning - confirm against LibInThai documentation.
 */

/* Vowels that follow their consonant (table slots d0-d9 and e5). */
#define VRS 0x0001
#define VRE 0x0002
#define VRX 0x0004

/* Extra flag carried by some VR* vowels; tested by the "VLA, C/ C" rule. */
#define VRA 0x0008

/* Vowels that precede their consonant (table slots e0-e4). */
#define VLA 0x0010
#define VLO 0x0020
#define VLI 0x0040

/* Letters in the consonant range that are classed separately from CC/CS. */
#define VC  0x0080

/* The two consonant classes; together they form CX. */
#define CC  0x0100
#define CS  0x0200

/* C2: consonant flag tested on the character after the break candidate.
 * CHB: a break directly before such a consonant is rejected when the
 *      previous character is also a consonant.
 * CHE: a break directly after such a consonant is rejected when the next
 *      character is also a consonant. */
#define C2  0x0400
#define CHB 0x0800
#define CHE 0x1000

/* Subset of the M marks (table slots e8-eb). */
#define MT  0x2000

/* Marks written on a consonant or vowel (table slots e6-ec). */
#define M   0x4000

/* Other Thai-range characters (symbols, digits); not part of A. */
#define T   0x8000

/* Composite classes. */
#define VL  (VLA|VLO|VLI)   /* any preceding vowel */
#define VR  (VRS|VRE|VRX)   /* any following vowel */
#define NE  (VL|VRS)        /* no break is allowed right after these */
#define NB  (VR|M)          /* no break is allowed right before these */
#define V   (VL|VR)         /* any vowel */
#define CX  (CC|CS)
#define C   (CX|VC)         /* any consonant-range letter */
#define A   (C|V|M)         /* everything the break rules operate on */

/* Class mask of character c; th_zcode() comes from th_char.h. */
#define twbtype(c)  (_TwbType[th_zcode(c)])

#ifndef TRUE
#define TRUE  1
#define FALSE 0
#endif
#define RETURN(b) return (b)
103 /////////////////////////////////////////////////
106 int TrbWordBreakPos(const th_char *pstr, int left,
107 const th_char *rstr, int right)
108 /* const ThBreakIterator *it, const th_char **p)*/
111 //int left, right;
112 //const th_char *s = *p;
114 const th_char *lstr = pstr + left;
115 th_char _c[6];
116 twb_t _t[6];
117 #define c(i) (_c[(i)+3])
118 #define t(i) (_t[(i)+3])
119 int i, j;
122 //left = s - it->begin;
124 if(left < 0) return -1;
126 //right = (it->end == NULL) ? 4 : it->begin - s;
128 if(right < 1) return -1;
131 // get c(0), t(0)
133 c(0) = rstr[0]; /* may be '\0' */
134 if(!th_isthai(c(0))) return -1;
135 t(0) = twbtype(c(0));
136 if(!(t(0) & A)) return -1;
139 // get c(-1), t(-1)
141 if(left >= 1) {
142 c(-1) = lstr[-1];
143 if(!th_isthai(c(-1))) return 0;
144 t(-1) = twbtype(c(-1));
145 if(!(t(-1) & A)) return 0; /* handle punctuation marks here */
146 } else { c(-1) = 0; t(-1) = 0; }
149 // get c(1..2), t(1..2)
151 for(i = 1; i <= 2; i++) {
152 if(i >= right) { c(i) = 0; t(i) = 0; }
153 else {
154 c(i) = rstr[i]; /* may be '\0'; */
155 if(!th_isthai(c(i))) right = i--;
156 else {
157 t(i) = twbtype(c(i));
158 if(!(t(i) & A)) right = i--;
163 // get c(-2..-3), t(-2..-3)
165 for(i = -2, j = -2; i >= -3 ; j--) {
166 if(j < -left) { c(i) = 0; t(i) = 0; i--; }
167 else {
168 c(i) = lstr[j];
169 if(!th_isthai(c(i))) left = 0;
170 else {
171 t(i) = (twb_t)(th_isthai(c(i)) ? twbtype(c(i)) : 0);
172 if(!(t(i) & A)) left = 0;
173 else {
174 if((t(i+1) & MT) && ((t(i) & VR) || (t(i+2) & VR))) {
175 c(i+1) = c(i); t(i+1) = t(i);
176 } else i--;
183 // prohibit the unlikely
185 if((t(-1) & C) && (t(0) & C)) {
186 if((t(-1) & CHE) || (t(0) & CHB)) return -1;
189 // special case : vlao, C/ sara_a|aa, !sara_a
191 if((t(-3) & (VLA|VLO)) && (t(-2) & C) && (c(0) != TH_SARA_A) &&
192 (c(-1) == TH_SARA_A || c(-0) == TH_SARA_AA)) return 0;
195 // prohibit break
197 if(t(0) & NB) return -1;
198 if(t(-1) & NE) return -1;
202 // apply 100% rules
204 if(t(-1) & VRE) {
205 if(c(-2) == TH_SARA_AA && c(-1) == TH_SARA_A) return 0;
206 return -1; /* usually too short syllable, part of word */
209 if(t(-2) & VRE) return -1;
211 if((t(0) & C) && (t(1) & (VR|MT)) && (c(2) != TH_THANTHAKHAT)) { /*?C, NB */
212 if((t(-1) & (VRS|VRX)) && c(1) == TH_SARA_I) return -1; /* exception */
213 if(t(-1) & (V|M)) return 0; /* !C/ C, NB */
214 if(t(-2) & VRS) return 0; /* VRS, C / C, NB */
215 if(!(t(0) & C2) && c(1) == TH_SARA_I) { /* / !C2 or /c, sara_i */
216 if(t(-2) & VRX) return 0; /* VRX, C / C, NB ? 100%? */
217 if(t(-2) & VC) return 0; /* VC, C / C, NB ? 100% */
220 if((t(-1) & VRX) && (t(0) & CC)) return 0; /* VRX/ CC */
221 if((t(-2) & VRS) && (t(-1) & C) && (t(0) & (V|M))) return 0;/* VRS, C/ !C */
224 if((t(0) & CX) && (t(1) & C2) && (c(2) != TH_THANTHAKHAT)) {
225 if((t(-2) & A) && (t(-1) & CX)) return 0; /* A, CX / CX, C2 */
226 if((t(-2) & CX) && (t(-1) & MT)) return 0; /* CX, MT / CX, C2 */
229 // apply 90% rules
231 if(t(0) & VL) return 0;
232 if(t(1) & VL) return -1;
233 if(c(-1) == TH_THANTHAKHAT && c(-2) != TH_RORUA && c(-2) != TH_LOLING) return 0;
236 //return -1;
237 // apply 80% rules
239 if(t(0) & CHE) {
240 if((t(-2) & VRS) && (t(-1) & C)) return 0; /* VRS, C/ CHE */
241 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
242 if(t(-1) & VC) return 0; /* VC/ CHE */
244 if(t(-1) & CHB) {
245 if((t(0) & C) && (t(1) & VR)) return 0; /* CHB/ CC, VR */
246 if(t(0) & VC) return 0; /* CHB/ VC */
249 if((t(-2) & VL) && (t(1) & VR)) { /* VL, C? C, VR */
250 if(t(-2) & VLI) return 0; /* VLI,C/C,VR .*/
251 else { /* vlao, C ? C , VR */
252 if(c(1) == TH_SARA_A) return 2; /* vlao, C, C, sara_a/ */
253 if(t(-2) & VLO) return 0; /* VLO, C/ C, !sara_a */
254 if(!(t(1) & VRA)) return 0; /* VLA, C/ C, !vca */
257 /* C,MT,C */
258 if((t(-2) & C) && (t(-1) & MT) && (t(0) & CX)) return 1;
260 return -1;
264 int TrbFollowing(const th_char *begin, int length, int offset)
266 //(ThBreakIterator *this, int offset)
269 const th_char *w = begin + offset;
270 const th_char *end = begin + length;
271 while(w < end && *w && !th_isthai(*w) && th_isspace(*w)) w++;
273 if(w < end && *w && !th_isthai(*w)) {
274 int english = FALSE;
275 while(w < end && *w && !th_isthai(*w) && !th_isspace(*w)) {
276 if(th_isalpha(*w)) english = TRUE;
277 w++;
279 if(english || w == end ||
280 (!th_isthai(*w) && th_isspace(*w))) return w - begin;
282 if(w == end || *w == 0 || !th_isthai(*w)) return w - begin;
283 w++;
284 if(w < end && *w && th_isthai(*w)) {
285 int brk = TrbWordBreakPos(begin, w-begin, w, end-w);
286 while (brk < 0) {
287 w++;
288 if(w == end || *w == 0 || !th_isthai(*w)) break;
289 brk = TrbWordBreakPos(begin, w-begin, w, end-w);
291 if (brk > 0) w += brk;
293 if(w < end && *w && !th_isthai(*w)) {
294 while(w < end && *w && !th_isthai(*w) &&
295 !th_isalpha(*w) && !th_isspace(*w)) w++;
297 return w - begin;
302 /////////////////////////////////////////////////
304 const twb_t _TwbType[0x100-0xa0] = {
305 #if 0
306 /* 80 € */ T,
307 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
308 /* 90 � */ T,
309 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
310 #endif
311 /* a0   */ 0,
312 /* a1 ¡ */ CS,
313 /* a2 ¢ */ CS | CHE,
314 /* a3 £ */ CC | CHE,
315 /* a4 ¤ */ CS | CHE,
316 /* a5 ¥ */ CC | CHE,
317 /* a6 ¦ */ CS,
318 /* a7 § */ CS | CHB,
319 /* a8 ¨ */ CS,
320 /* a9 © */ CC | CHE,
321 /* aa ª */ CS,
322 /* ab « */ CC | CHE,
323 /* ac ¬ */ CC | CHB | CHE,
324 /* ad ­ */ CS | CHB,
325 /* ae ® */ CS | CHB,
326 /* af ¯ */ CS | CHB,
327 /* b0 ° */ CS,
328 /* b1 ± */ CS | CHB | CHE,
329 /* b2 ² */ CS | CHB | CHE,
330 /* b3 ³ */ CS | CHB,
331 /* b4 ´ */ CS,
332 /* b5 µ */ CS,
333 /* b6 ¶ */ CS,
334 /* b7 · */ CS,
335 /* b8 ¸ */ CS,
336 /* b9 ¹ */ CS,
337 /* ba º */ CS,
338 /* bb » */ CS,
339 /* bc ¼ */ CC | CHE,
340 /* bd ½ */ CC | CHE,
341 /* be ¾ */ CS,
342 /* bf ¿ */ CS,
343 /* c0 À */ CS | CHE,
344 /* c1 Á */ CS,
345 /* c2 Â */ CS,
346 /* c3 Ã */ CS | C2 | CHE, /* ? add CHE */
347 /* c4 Ä */ VC | CHE,
348 /* c5 Å */ CS | C2,
349 /* c6 Æ */ VC | CHE,
350 /* c7 Ç */ VC | C2,
351 /* c8 È */ CS,
352 /* c9 É */ CS | CHB,
353 /* ca Ê */ CS | CHE,
354 /* cb Ë */ CC | CHE,
355 /* CC Ì */ CS | CHB | CHE,
356 /* cd Í */ VC,
357 /* ce Î */ CC | CHE,
358 /* cf Ï */ T,
359 /* d0 Ð */ VRE | VRA,
360 /* d1 Ñ */ VRS,
361 /* d2 Ò */ VRX | VRA,
362 /* d3 Ó */ VRE,
363 /* d4 Ô */ VRX | VRA,
364 /* d5 Õ */ VRX | VRA,
365 /* d6 Ö */ VRS,
366 /* d7 × */ VRS | VRA,
367 /* d8 Ø */ VRX,
368 /* d9 Ù */ VRX,
369 /* da Ú */ T,
370 /* db Û */ 0,
371 /* dc Ü */ 0,
372 /* dd Ý */ 0,
373 /* de Þ */ 0,
374 /* df ß */ T,
375 /* e0 à */ VLA,
376 /* e1 á */ VLO,
377 /* e2 â */ VLO,
378 /* e3 ã */ VLI,
379 /* e4 ä */ VLI,
380 /* e5 å */ VRE,
381 /* e6 æ */ M,
382 /* e7 ç */ M,
383 /* e8 è */ M | MT,
384 /* e9 é */ M | MT,
385 /* ea ê */ M | MT,
386 /* eb ë */ M | MT,
387 /* ec ì */ M,
388 /* ed í */ T,
389 /* ee î */ T,
390 /* ef ï */ T,
391 /* f0 ð */ T,
392 /* f1 ñ */ T,
393 /* f2 ò */ T,
394 /* f3 ó */ T,
395 /* f4 ô */ T,
396 /* f5 õ */ T,
397 /* f6 ö */ T,
398 /* f7 ÷ */ T,
399 /* f8 ø */ T,
400 /* f9 ù */ T,
401 /* fa ú */ T,
402 /* fb û */ T,
403 /* fc ü */ 0,
404 /* fd ý */ 0,
405 /* fe þ */ 0,
406 /* ff ’ */ 0