1 /* ***** BEGIN LICENSE BLOCK *****
2 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 * The contents of this file are subject to the Mozilla Public License Version
5 * 1.1 (the "License"); you may not use this file except in compliance with
6 * the License. You may obtain a copy of the License at
7 * http://www.mozilla.org/MPL/
9 * Software distributed under the License is distributed on an "AS IS" basis,
10 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
11 * for the specific language governing rights and limitations under the
14 * The Original Code is LibInThai.
16 * The Initial Developer of the Original Code is
18 * Portions created by the Initial Developer are Copyright (C) 1998
19 * the Initial Developer. All Rights Reserved.
23 * Alternatively, the contents of this file may be used under the terms of
24 * either of the GNU General Public License Version 2 or later (the "GPL"),
25 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
26 * in which case the provisions of the GPL or the LGPL are applicable instead
27 * of those above. If you wish to allow use of your version of this file only
28 * under the terms of either the GPL or the LGPL, and not to allow others to
29 * use your version of this file under the terms of the MPL, indicate your
30 * decision by deleting the provisions above and replace them with the notice
31 * and other provisions required by the GPL or the LGPL. If you do not delete
32 * the provisions above, a recipient may use your version of this file under
33 * the terms of any one of the MPL, the GPL or the LGPL.
35 * ***** END LICENSE BLOCK ***** */
/* ASCII letter test: non-zero only for a-z and A-Z. The argument is expanded
 * up to four times, so pass an expression without side effects. */
41 #define th_isalpha(c) (((c)>='a'&&(c)<='z')||((c)>='A'&&(c)<='Z'))
/* Blank test: non-zero only for a space or a horizontal tab; newlines and
 * other whitespace are not matched. */
42 #define th_isspace(c) ((c)==' '||(c)=='\t')
46 /////////////////////////////////////////////////
47 // Thai character type array
/* twb_t holds the type flags of one character as an unsigned short bit set;
 * the table entries are built by OR-ing flag macros together. */
50 typedef unsigned short twb_t
;
/* Character type table with 0x100-0xa0 (96) entries. It is read through the
 * twbtype() macro, which computes the index with th_zcode(). */
51 extern const twb_t _TwbType
[0x100-0xa0];
/* VL: matches any of the three flags VLA, VLO, VLI. */
84 #define VL (VLA|VLO|VLI)
/* VR: matches any of the three flags VRS, VRE, VRX. */
85 #define VR (VRS|VRE|VRX)
/* Type flags of character c, fetched from _TwbType at index th_zcode(c).
 * NOTE(review): th_zcode() is defined outside this listing; the lookup is only
 * in bounds if it yields 0..0x5f - confirm. Callers below test th_isthai()
 * before using this macro. */
93 #define twbtype(c) (_TwbType[th_zcode(c)])
/* Plain wrapper around a return statement. */
99 #define RETURN(b) return (b)
103 /////////////////////////////////////////////////
/*
 * TrbWordBreakPos - apply character-class rules around the position rstr[0].
 *
 * pstr  : start of the text; lstr = pstr + left is derived from it.
 * left  : count of characters available before the tested position.
 *         A negative value makes the function return -1 immediately.
 * rstr  : the tested position; rstr[0] is loaded into c(0).
 * right : count of characters available from rstr onward.
 *         A value below 1 makes the function return -1 immediately.
 *
 * The function loads a small window of characters c(-3)..c(2) and their
 * twbtype() flags t(-3)..t(2), storing 0 in slots that fall outside the
 * available text, and then evaluates an ordered list of rules on that window.
 * The first rule that matches decides the result.
 *
 * The visible return values are -1, 0, 1 and 2. TrbFollowing() adds a positive
 * result to its cursor; presumably 0 means "break here" and -1 means "no break
 * here" - TODO confirm against the caller.
 *
 * NOTE(review): this listing does not show every line of the body (braces,
 * some else branches and the declarations of _c, _t, i and j are absent), so
 * the comments below describe only the statements that are visible.
 */
106 int TrbWordBreakPos(const th_char
*pstr
, int left
,
107 const th_char
*rstr
, int right
)
108 /* const ThBreakIterator *it, const th_char **p)*/
112 //const th_char *s = *p;
114 const th_char
*lstr
= pstr
+ left
;
/* c(i) and t(i) address the window relative to the tested position: the
 * offset is shifted by 3, so offset -3 lands on array slot 0. */
117 #define c(i) (_c[(i)+3])
118 #define t(i) (_t[(i)+3])
122 //left = s - it->begin;
124 if(left
< 0) return -1;
126 //right = (it->end == NULL) ? 4 : it->begin - s;
128 if(right
< 1) return -1;
/* The tested character must be Thai and carry the A flag, otherwise -1. */
133 c(0) = rstr
[0]; /* may be '\0' */
134 if(!th_isthai(c(0))) return -1;
135 t(0) = twbtype(c(0));
136 if(!(t(0) & A
)) return -1;
/* A preceding character that is not Thai, or lacks the A flag, yields 0. */
143 if(!th_isthai(c(-1))) return 0;
144 t(-1) = twbtype(c(-1));
145 if(!(t(-1) & A
)) return 0; /* handle punctuation marks here */
146 } else { c(-1) = 0; t(-1) = 0; }
149 // get c(1..2), t(1..2)
/* When a non-Thai or non-A character is met, right is cut down to i and i is
 * stepped back; the next pass then appears to revisit the same slot through
 * the i >= right test, which zeroes it. */
151 for(i
= 1; i
<= 2; i
++) {
152 if(i
>= right
) { c(i
) = 0; t(i
) = 0; }
154 c(i
) = rstr
[i
]; /* may be '\0'; */
155 if(!th_isthai(c(i
))) right
= i
--;
157 t(i
) = twbtype(c(i
));
158 if(!(t(i
) & A
)) right
= i
--;
163 // get c(-2..-3), t(-2..-3)
/* j walks backwards through the text while i names the window slot being
 * filled; setting left to 0 makes the j < -left test zero the remaining
 * slots. */
165 for(i
= -2, j
= -2; i
>= -3 ; j
--) {
166 if(j
< -left
) { c(i
) = 0; t(i
) = 0; i
--; }
169 if(!th_isthai(c(i
))) left
= 0;
171 t(i
) = (twb_t
)(th_isthai(c(i
)) ? twbtype(c(i
)) : 0);
172 if(!(t(i
) & A
)) left
= 0;
/* If slot i+1 holds an MT character with a VR character on either side, the
 * newly read character overwrites slot i+1. */
174 if((t(i
+1) & MT
) && ((t(i
) & VR
) || (t(i
+2) & VR
))) {
175 c(i
+1) = c(i
); t(i
+1) = t(i
);
183 // prohibit the unlikely
185 if((t(-1) & C
) && (t(0) & C
)) {
186 if((t(-1) & CHE
) || (t(0) & CHB
)) return -1;
189 // special case : vlao, C/ sara_a|aa, !sara_a
/* NOTE(review): c(-0) below is the same element as c(0). The rule comment
 * above places "sara_a|aa" at position -1, so c(-1) == TH_SARA_AA may have
 * been intended - confirm before changing. */
191 if((t(-3) & (VLA
|VLO
)) && (t(-2) & C
) && (c(0) != TH_SARA_A
) &&
192 (c(-1) == TH_SARA_A
|| c(-0) == TH_SARA_AA
)) return 0;
/* No break when the tested character has NB or the previous one has NE. */
197 if(t(0) & NB
) return -1;
198 if(t(-1) & NE
) return -1;
205 if(c(-2) == TH_SARA_AA
&& c(-1) == TH_SARA_A
) return 0;
206 return -1; /* usually too short syllable, part of word */
209 if(t(-2) & VRE
) return -1;
211 if((t(0) & C
) && (t(1) & (VR
|MT
)) && (c(2) != TH_THANTHAKHAT
)) { /*?C, NB */
212 if((t(-1) & (VRS
|VRX
)) && c(1) == TH_SARA_I
) return -1; /* exception */
213 if(t(-1) & (V
|M
)) return 0; /* !C/ C, NB */
214 if(t(-2) & VRS
) return 0; /* VRS, C / C, NB */
215 if(!(t(0) & C2
) && c(1) == TH_SARA_I
) { /* / !C2 or /c, sara_i */
216 if(t(-2) & VRX
) return 0; /* VRX, C / C, NB ? 100%? */
217 if(t(-2) & VC
) return 0; /* VC, C / C, NB ? 100% */
220 if((t(-1) & VRX
) && (t(0) & CC
)) return 0; /* VRX/ CC */
221 if((t(-2) & VRS
) && (t(-1) & C
) && (t(0) & (V
|M
))) return 0;/* VRS, C/ !C */
224 if((t(0) & CX
) && (t(1) & C2
) && (c(2) != TH_THANTHAKHAT
)) {
225 if((t(-2) & A
) && (t(-1) & CX
)) return 0; /* A, CX / CX, C2 */
226 if((t(-2) & CX
) && (t(-1) & MT
)) return 0; /* CX, MT / CX, C2 */
/* A VL character at the tested position yields 0; a VL character right after
 * it yields -1. */
231 if(t(0) & VL
) return 0;
232 if(t(1) & VL
) return -1;
233 if(c(-1) == TH_THANTHAKHAT
&& c(-2) != TH_RORUA
&& c(-2) != TH_LOLING
) return 0;
240 if((t(-2) & VRS
) && (t(-1) & C
)) return 0; /* VRS, C/ CHE */
241 /*if(t(-1) & VRX) return 0; // VRX/ CHE */
242 if(t(-1) & VC
) return 0; /* VC/ CHE */
245 if((t(0) & C
) && (t(1) & VR
)) return 0; /* CHB/ CC, VR */
246 if(t(0) & VC
) return 0; /* CHB/ VC */
249 if((t(-2) & VL
) && (t(1) & VR
)) { /* VL, C? C, VR */
250 if(t(-2) & VLI
) return 0; /* VLI,C/C,VR .*/
251 else { /* vlao, C ? C , VR */
252 if(c(1) == TH_SARA_A
) return 2; /* vlao, C, C, sara_a/ */
253 if(t(-2) & VLO
) return 0; /* VLO, C/ C, !sara_a */
254 if(!(t(1) & VRA
)) return 0; /* VLA, C/ C, !vca */
258 if((t(-2) & C
) && (t(-1) & MT
) && (t(0) & CX
)) return 1;
/*
 * TrbFollowing - scan forward from begin + offset inside begin[0..length).
 *
 * begin  : start of the text buffer.
 * length : number of characters in the buffer; end = begin + length.
 * offset : index at which scanning starts (w = begin + offset).
 *
 * Every visible return has the form w - begin, i.e. an index into the buffer.
 * Scanning also stops at a 0 character, not only at end.
 *
 * NOTE(review): this listing does not show every line of the body (braces,
 * the loop header around the TrbWordBreakPos() calls, the declaration of
 * english and the final return are absent), so the comments below describe
 * only the statements that are visible.
 */
264 int TrbFollowing(const th_char
*begin
, int length
, int offset
)
266 //(ThBreakIterator *this, int offset)
269 const th_char
*w
= begin
+ offset
;
270 const th_char
*end
= begin
+ length
;
/* Skip leading spaces and tabs. */
271 while(w
< end
&& *w
&& !th_isthai(*w
) && th_isspace(*w
)) w
++;
/* Non-Thai text: scan the characters up to the next blank, Thai character or
 * end of input, noting whether an ASCII letter was seen. */
273 if(w
< end
&& *w
&& !th_isthai(*w
)) {
275 while(w
< end
&& *w
&& !th_isthai(*w
) && !th_isspace(*w
)) {
276 if(th_isalpha(*w
)) english
= TRUE
;
/* Return here if a letter was seen, the input is exhausted, or the scan
 * stopped on a blank. */
279 if(english
|| w
== end
||
280 (!th_isthai(*w
) && th_isspace(*w
))) return w
- begin
;
282 if(w
== end
|| *w
== 0 || !th_isthai(*w
)) return w
- begin
;
/* Thai text: query TrbWordBreakPos() with the text before w as left context
 * and the text from w to end as right context. */
284 if(w
< end
&& *w
&& th_isthai(*w
)) {
285 int brk
= TrbWordBreakPos(begin
, w
-begin
, w
, end
-w
);
288 if(w
== end
|| *w
== 0 || !th_isthai(*w
)) break;
289 brk
= TrbWordBreakPos(begin
, w
-begin
, w
, end
-w
);
/* A positive result is an extra distance to advance past w. */
291 if (brk
> 0) w
+= brk
;
/* Skip non-Thai characters that are neither ASCII letters nor blanks. */
293 if(w
< end
&& *w
&& !th_isthai(*w
)) {
294 while(w
< end
&& *w
&& !th_isthai(*w
) &&
295 !th_isalpha(*w
) && !th_isspace(*w
)) w
++;
302 /////////////////////////////////////////////////
304 const twb_t _TwbType
[0x100-0xa0] = {
307 /* 81-8f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
309 /* 91-9f */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
323 /* ac ¬ */ CC
| CHB
| CHE
,
328 /* b1 ± */ CS
| CHB
| CHE
,
329 /* b2 ² */ CS
| CHB
| CHE
,
346 /* c3 Ã */ CS
| C2
| CHE
, /* ? add CHE */
355 /* CC Ì */ CS
| CHB
| CHE
,
359 /* d0 Ð */ VRE
| VRA
,
361 /* d2 Ò */ VRX
| VRA
,
363 /* d4 Ô */ VRX
| VRA
,
364 /* d5 Õ */ VRX
| VRA
,
366 /* d7 × */ VRS
| VRA
,