6 #include "../hunspell/csutil.hxx"
7 #include "textparser.hxx"
13 // ISO-8859-1 HTML character entities
15 static const char * LATIN1
[] = {
47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
49 TextParser::TextParser() {
53 TextParser::TextParser(const char * wordchars
)
58 TextParser::TextParser(unsigned short * wordchars
, int len
)
63 TextParser::~TextParser()
67 int TextParser::is_wordchar(char * w
)
69 if (*w
== '\0') return 0;
74 idx
= (wc
.h
<< 8) + wc
.l
;
75 return (unicodeisalpha(idx
) || (wordchars_utf16
&& flag_bsearch(wordchars_utf16
, *((unsigned short *) &wc
), wclen
)));
77 return wordcharacters
[(*w
+ 256) % 256];
81 const char * TextParser::get_latin1(char * s
)
85 while ((i
< LATIN1_LEN
) &&
86 strncmp(LATIN1
[i
], s
, strlen(LATIN1
[i
]))) i
++;
87 if (i
!= LATIN1_LEN
) return LATIN1
[i
];
92 void TextParser::init(const char * wordchars
)
94 for (int i
= 0; i
< MAXPREVLINE
; i
++) {
104 for (j
= 0; j
< 256; j
++) {
105 wordcharacters
[j
] = 0;
107 if (!wordchars
) wordchars
= "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
108 for (j
= 0; j
< strlen(wordchars
); j
++) {
109 wordcharacters
[(wordchars
[j
] + 256) % 256] = 1;
113 void TextParser::init(unsigned short * wc
, int len
)
115 for (int i
= 0; i
< MAXPREVLINE
; i
++) {
124 wordchars_utf16
= wc
;
128 int TextParser::next_char(char * line
, int * pos
) {
129 if (*(line
+ *pos
) == '\0') return 1;
131 if (*(line
+ *pos
) >> 7) {
132 // jump to next UTF-8 character
133 for((*pos
)++; (*(line
+ *pos
) & 0xc0) == 0x80; (*pos
)++);
141 void TextParser::put_line(char * word
)
143 actual
= (actual
+ 1) % MAXPREVLINE
;
144 strcpy(line
[actual
], word
);
150 char * TextParser::get_prevline(int n
)
152 return mystrdup(line
[(actual
+ MAXPREVLINE
- n
) % MAXPREVLINE
]);
155 char * TextParser::get_line()
157 return get_prevline(0);
160 char * TextParser::next_token()
167 case 0: // non word chars
168 if (is_wordchar(line
[actual
] + head
)) {
171 } else if ((latin1
= get_latin1(line
[actual
] + head
))) {
174 head
+= strlen(latin1
);
178 if ((latin1
= get_latin1(line
[actual
] + head
))) {
179 head
+= strlen(latin1
);
180 } else if (! is_wordchar(line
[actual
] + head
)) {
182 char * t
= alloc_token(token
, &head
);
187 if (next_char(line
[actual
], &head
)) return NULL
;
191 int TextParser::get_tokenpos()
196 int TextParser::change_token(const char * word
)
199 char * r
= mystrdup(line
[actual
] + head
);
200 strcpy(line
[actual
] + token
, word
);
201 strcat(line
[actual
], r
);
209 void TextParser::check_urls()
218 case 0: // non word chars
219 if (is_wordchar(line
[actual
] + url_head
)) {
221 url_token
= url_head
;
223 } else if (*(line
[actual
] + url_head
) == '/') {
225 url_token
= url_head
;
230 char ch
= *(line
[actual
] + url_head
);
233 // MS-DOS, Windows path
234 (strncmp(line
[actual
] + url_head
, ":\\", 2) == 0) ||
236 (strncmp(line
[actual
] + url_head
, "://", 3) == 0)) {
238 } else if (! (is_wordchar(line
[actual
] + url_head
) ||
239 (ch
== '-') || (ch
== '_') || (ch
== '\\') ||
240 (ch
== '.') || (ch
== ':') || (ch
== '/') ||
241 (ch
== '~') || (ch
== '%') || (ch
== '*') ||
242 (ch
== '$') || (ch
== '[') || (ch
== ']') ||
243 (ch
== '?') || (ch
== '!') ||
244 ((ch
>= '0') && (ch
<= '9')))) {
247 for (int i
= url_token
; i
< url_head
; i
++) {
255 *(urlline
+ url_head
) = 0;
256 if (next_char(line
[actual
], &url_head
)) return;
260 int TextParser::get_url(int token_pos
, int * head
)
262 for (int i
= *head
; urlline
[i
] && *(line
[actual
]+i
); i
++, (*head
)++);
263 return checkurl
? 0 : urlline
[token_pos
];
266 void TextParser::set_url_checking(int check
)
272 char * TextParser::alloc_token(int token
, int * head
)
274 if (get_url(token
, head
)) return NULL
;
275 char * t
= (char *) malloc(*head
- token
+ 1);
277 t
[*head
- token
] = '\0';
278 strncpy(t
, line
[actual
] + token
, *head
- token
);
279 // remove colon for Finnish and Swedish language
280 if (t
[*head
- token
- 1] == ':') {
281 t
[*head
- token
- 1] = '\0';
289 fprintf(stderr
,"Error - Insufficient Memory\n");