softcount: tolerate zero ngrams
[vspell.git] / libvspell / tokenize.ll
blob9be4784be11d90d1e8e819db1193dbdccf3e4dc0
1 %option noyywrap
3 %{
4 /* tokenize.l from SATZ, modified by pclouds */
6 /* 
7 %option c++
8 %option outfile="lex.yy.c"
9 */
// Legacy SATZ markup handling: named pieces of the p and s tags
// (opening angle bracket, tag letter, optional slash, closing angle bracket).
// NOTE(review): these lines sit after the %{ opener and before the C++
// prologue, so they are presumably disabled in the real file - confirm.
13 LEFT_COMMENT                    \<
14 P                               p
15 S                               s
16 COMMENT_SLASH                   \/
17 RIGHT_COMMENT                   \>
// Rules built from the pieces above: the opening p tag, the opening s tag and
// the closing p tag share the empty action (a lone semicolon) through the
// | chain; the closing s tag prints the marker line **end**.
19 {LEFT_COMMENT}{P}{RIGHT_COMMENT}                                |
20 {LEFT_COMMENT}{S}{RIGHT_COMMENT}                                |
21 {LEFT_COMMENT}{COMMENT_SLASH}{P}{RIGHT_COMMENT}                 ;
22 {LEFT_COMMENT}{COMMENT_SLASH}{S}{RIGHT_COMMENT}       {printf("**end**\n"); } ;
26 #include "tokenize.h"
28 using namespace std;
/* Scanner state shared by tokenize() and YY_INPUT: the text being scanned,
 * the read offset into it, its length, and the token list being filled.
 * These are file-scope statics, so only one scan can be in progress at a time. */
29 static const char *buffer;
30 static int pos,len;
31 static Tokens *out;
/* Feed the scanner from the in-memory buffer instead of a file.
 * Every request delivers at most one byte: when pos has reached len the
 * result is YY_NULL (end of input); otherwise buffer[pos] is stored in
 * buf[0], pos is advanced, and result is set to 1.
 * max_size is not used, so the scanner refills one byte per request. */
33 #define YY_INPUT(buf,result,max_size) \
34   { \
35     result = (pos == len) ? YY_NULL : (buf[0] = buffer[pos++], 1); \
36   }
/* Punctuation classes used by the rules.
 *   SENTENCE_FINAL      - full stop, question mark or exclamation mark
 *   HYPHEN              - the hyphen-minus character
 *   OPEN_SINGLE_QUOTE   - the backquote character
 *   CLOSE_SINGLE_QUOTE  - the apostrophe character
 *   RIGHT_PAREN         - closers that may trail sentence-final punctuation:
 *                         double quote, right parenthesis, right bracket,
 *                         right brace, greater-than sign, apostrophe
 */
39 SENTENCE_FINAL                  [.?!]
40 HYPHEN                          [\-]
41 OPEN_SINGLE_QUOTE               [\`]
42 CLOSE_SINGLE_QUOTE              [\']
43 RIGHT_PAREN                     [\"\)\]\}\>\']
/* Word-character classes.
 * All three classes start from the same base set: ASCII letters, ASCII digits,
 * every byte from 0x80 to 0xff, and the six control bytes 0x02, 0x05, 0x06,
 * 0x14, 0x19 and 0x1e.
 * NOTE(review): the high bytes and the six control bytes are presumably the
 * letter positions of an 8-bit Vietnamese encoding such as VISCII - confirm.
 *
 *   LETTERS_AND_NUMBERS          - the base set only.
 *   LETTERS_NUMBER_AND_THEN_SOME - base set plus punctuation allowed inside a
 *                                  word: full stop, comma, colon, apostrophe,
 *                                  dollar, percent, hyphen, backslash, slash,
 *                                  ampersand, and octal 177 (DEL).
 *   APOSTROPHE                   - the apostrophe character (same character as
 *                                  CLOSE_SINGLE_QUOTE).
 *   SINGLE_CHARACTER             - base set plus most remaining ASCII
 *                                  punctuation and brackets, and octal 177.
 *                                  Octal 177 is listed twice; the duplicate
 *                                  has no effect inside a character class.
 *                                  Full stop, hyphen, backquote and apostrophe
 *                                  are not in this class.
 */
46 LETTERS_AND_NUMBERS             [a-zA-Z0-9\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x2\x5\x6\x14\x19\x1e]
47 LETTERS_NUMBER_AND_THEN_SOME    [a-zA-Z0-9\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x2\x5\x6\x14\x19\x1e\.\,\:\'\$\%\-\\\/\&\177]
48 APOSTROPHE                      \'
50 SINGLE_CHARACTER                [a-zA-Z0-9\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x2\x5\x6\x14\x19\x1e\#\_\;\!\?\@\*\+\=\~\|\^\&\,\:\$\%\\\/\177\(\)\[\]\{\}\<\>\"\177]
/* Separator classes.
 *   WHITE_SPACE - space, tab or newline.
 *   NEWLINE     - newline only (already a member of WHITE_SPACE).
 *   INVISIBLE   - negated class: any byte that is NOT printable ASCII
 *                 (octal 040 to 176), NOT in 0x80 to 0xff, and NOT one of the
 *                 control bytes 0x02, 0x05, 0x06, 0x14, 0x19, 0x1e. What is
 *                 left is the remaining control characters (tab and newline
 *                 included) and octal 177 (DEL).
 */
52 WHITE_SPACE                     [ \t\n]
53 NEWLINE                         [\n]
54 INVISIBLE                       [^\040-\176\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff\x2\x5\x6\x14\x19\x1e]
61 {SENTENCE_FINAL}+{RIGHT_PAREN}*                                 |
62 {HYPHEN}+                                                       |
63 {OPEN_SINGLE_QUOTE}+                                            |
64 {CLOSE_SINGLE_QUOTE}+                                           |
66 {LETTERS_NUMBER_AND_THEN_SOME}+{LETTERS_AND_NUMBERS}            |
67 {LETTERS_AND_NUMBERS}+{APOSTROPHE}                              |
69 {SINGLE_CHARACTER}                   { /* Shared action for the seven patterns chained with the
                                           bar operator: sentence-final marks with optional closers,
                                           a run of hyphens, a run of backquotes, a run of
                                           apostrophes, a word that may contain punctuation but ends
                                           in a letter or digit, a word followed by an apostrophe,
                                           or one leftover character.
                                           The matched text is appended to the output list as a
                                           Token constructed with true.
                                           NOTE(review): the meaning of the bool argument is defined
                                           in tokenize.h - presumably it marks a real token; confirm. */
                                        out->push_back(Token(true,yytext)); } ;
71 ({WHITE_SPACE}|{INVISIBLE}|{NEWLINE})+                          { /* A run of whitespace or INVISIBLE bytes is appended as a
                                                                      Token constructed with false. */
                                                                   out->push_back(Token(false,yytext)); } ;
/*
 * Run the scanner over the string s and append the resulting Token objects
 * to tokens (the rule actions call push_back on it; the visible code does not
 * clear tokens first).
 *
 * The input text, read offset, length and output list are published through
 * the file-scope statics read by YY_INPUT and the rule actions, so calls must
 * not overlap. buffer points into s and is only meaningful during the call.
 *
 * Returns true when yylex() returns 0. None of the visible rule actions
 * return a value, so presumably that happens only at end of input - confirm.
 *
 * NOTE(review): s.size() is stored in the int variable len, so a string
 * longer than INT_MAX bytes would be narrowed.
 */
74 bool tokenize(const string& s,Tokens &tokens)
76   buffer = s.c_str();
77   pos = 0;
78   len = s.size();
79   out = &tokens;
80   return yylex() == 0;