1 /***********************************************************************
3 * This software is part of the ast package *
4 * Copyright (c) 1992-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
13 * Information and Software Systems Research *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
20 ***********************************************************************/
24 * AT&T Bell Laboratories
26 * library interface for word count
/* the wide-character support is conditional on the wchar/wctype feature tests */
33 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
/* map iswspace() onto isspace(); NOTE(review): the preprocessor branch that selects this definition is not visible in this excerpt */
42 #define iswspace(x) isspace(x)
/* bit tests on a classification value: WC_NL, WC_MB and WC_SP respectively */
52 #define eol(c) ((c)&WC_NL)
53 #define mbc(c) ((c)&WC_MB)
54 #define spc(c) ((c)&WC_SP)
/* call the ast.mb_towc hook on n bytes at p, passing the address of w, so w must be an lvalue */
55 #define mb2wc(w,p,n) (*ast.mb_towc)(&w,(char*)p,n)
/*
 * Set up a Wc_t for the given mode flags: the structure is obtained
 * from stakalloc() and its per-byte classification table wp->type is
 * filled in.
 * NOTE(review): several lines of this function (braces, the return
 * statement, the assignment of w and wp->mb) are not visible in this
 * excerpt.
 */
57 Wc_t
* wc_init(int mode
)
/* allocation of the handle; the body of the failure branch is not visible here */
63 if (!(wp
= (Wc_t
*)stakalloc(sizeof(Wc_t
))))
67 #if _hdr_wchar && _hdr_wctype && _lib_iswctype
/* taken only when the caller did not pass WC_NOUTF8 and the LC_CTYPE locale carries the LC_utf8 flag */
68 else if (!(mode
& WC_NOUTF8
) && (lcinfo(LC_CTYPE
)->lc
->flags
& LC_utf8
))
/* visit every byte value, counting n down from (1<<CHAR_BIT)-1 to 0 */
74 for (n
= (1<<CHAR_BIT
); --n
>= 0;)
/* a byte is classed WC_SP only when w is nonzero and isspace() accepts it; otherwise its entry starts at 0 */
75 wp
->type
[n
] = (w
&& isspace(n
)) ? WC_SP
: 0;
/* newline is unconditionally both a space and a line terminator */
76 wp
->type
['\n'] = WC_SP
|WC_NL
;
/* the multibyte markings below are made only when WC_MBYTE or WC_WORDS is requested and wp->mb is positive */
77 if ((mode
& (WC_MBYTE
|WC_WORDS
)) && wp
->mb
> 0)
/* n spans 0..63, so 0x80+n covers 0x80..0xbf and 0xc0+n covers 0xc0..0xff */
79 for (n
= 0; n
< 64; n
++)
/* bytes 0x80..0xbf are flagged WC_MB */
81 wp
->type
[0x80+n
] |= WC_MB
;
/* bytes from 0xc0 up receive WC_MB plus a value from 1 to 5; NOTE(review): the conditions choosing among +1 .. +5 are not visible in this excerpt, presumably they depend on n */
83 wp
->type
[0xc0+n
] |= WC_MB
+1;
85 wp
->type
[0xc0+n
] |= WC_MB
+2;
87 wp
->type
[0xc0+n
] |= WC_MB
+3;
89 wp
->type
[0xc0+n
] |= WC_MB
+4;
91 wp
->type
[0xc0+n
] |= WC_MB
+5;
/* 0xc0, 0xc1, 0xfe and 0xff are overwritten (plain assignment) with WC_MB|WC_ERR; presumably because these byte values do not occur in well-formed UTF-8 */
93 wp
->type
[0xc0] = WC_MB
|WC_ERR
;
94 wp
->type
[0xc1] = WC_MB
|WC_ERR
;
95 wp
->type
[0xfe] = WC_MB
|WC_ERR
;
96 wp
->type
[0xff] = WC_MB
|WC_ERR
;
/*
 * Report an invalid multibyte character: file and nlines are stored in
 * error_info and the message is issued through error() with
 * ERROR_SYSTEM|1.
 * NOTE(review): the return statement is not visible in this excerpt;
 * callers in wc_count() assign the result to eline.
 */
102 static int invalid(const char *file
, int nlines
)
/* const is cast away because error_info.file is assigned a char* */
104 error_info
.file
= (char*)file
;
105 error_info
.line
= nlines
;
106 error(ERROR_SYSTEM
|1, "invalid multibyte character");
113 * handle utf space characters
/*
 * Advance a small state value by one input byte c and produce the next
 * state; 0 is produced whenever c does not match what the current step
 * expects.
 * NOTE(review): the switch/case labels that select among the statements
 * below are not visible in this excerpt, so which incoming state reaches
 * which statement cannot be confirmed from here. The value 10 appears to
 * be the final state meaning that a space was recognized; confirm
 * against the caller.
 */
116 static int chkstate(int state
, register unsigned int c
)
/* 0x9a leads to state 4 */
121 state
= (c
==0x9a?4:0);
/* 0x80 leads to state 6 and 0x81 to state 7 (6 plus the low bit of c) */
124 state
= ((c
==0x80||c
==0x81)?6+(c
&1):0);
/* 0x80 leads to state 5 */
127 state
= (c
==0x80?5:0);
/* 0x80 leads to state 10 */
130 state
= (c
==0x80?10:0);
133 state
= (c
==0x80?10:0);
/* three-way test on c: 0xa0 or 0xa1, else high nibble 0x8, else 0xaf; the bodies of the first two branches are only partly visible */
137 if(c
==0xa0 || c
==0xa1)
139 else if((c
&0xf0)== 0x80)
/* result depends on whether iswspace() accepts 0x2007 */
142 return(iswspace(0x2007)?10:0);
/* 0xaf is accepted only if iswspace() accepts 0x202f */
146 else if(c
==0xaf && iswspace(0x202f))
/* 0x9f leads to state 10 */
150 state
= (c
==0x9f?10:0);
/* fall back to asking iswspace() about c itself */
153 return (iswspace(c
)?10:0);
159 * compute the line, word, and character count for file <fd>
/*
 * Count lines, words and characters on the stream fd using the table
 * and mode held in wp; file is only handed to invalid() for diagnostics.
 * NOTE(review): large parts of the control structure and the end of the
 * function are not visible in this excerpt, so the comments below
 * describe individual visible statements only.
 */
162 int wc_count(Wc_t
*wp
, Sfio_t
*fd
, const char* file
)
/* local alias for the classification table */
164 register char* type
= wp
->type
;
165 register unsigned char* cp
;
166 register Sfoff_t nbytes
;
167 register Sfoff_t nchars
;
168 register Sfoff_t nwords
;
169 register Sfoff_t nlines
;
/* eline is compared with nlines before each invalid() report and then reassigned from the result of invalid() */
170 register Sfoff_t eline
= -1;
171 register Sfoff_t longest
= 0;
173 register unsigned char* endbuff
;
/* the scan starts as if the previous character had been a space */
174 register int lasttype
= WC_SP
;
175 unsigned int lastchar
;
180 unsigned char side
[32];
/* turn SF_WRITE on for fd; presumably so that the reserved buffer may be written to (a newline is stored at endbuff further down) */
182 sfset(fd
,SF_WRITE
,1);
183 nlines
= nwords
= nchars
= nbytes
= 0;
/* path for wp->mb < 0 when characters or words are wanted: decoding goes through mb2wc() */
185 if (wp
->mb
< 0 && (wp
->mode
& (WC_MBYTE
|WC_WORDS
)))
187 cp
= buff
= endbuff
= 0;
/* more input is needed when the buffer is used up or mb2wc() gives a negative result */
190 if (cp
>= endbuff
|| (n
= mb2wc(x
, cp
, endbuff
-cp
)) < 0)
/* o is the number of bytes left over; tested against the size of the side buffer */
192 if ((o
= endbuff
-cp
) < sizeof(side
))
/* fetch the next block from the stream; nothing reserved or a non-positive size ends the input */
203 if (!(buff
= (unsigned char*)sfreserve(fd
, SF_UNBOUND
, 0)) || (n
= sfvalue(fd
)) <= 0)
/* in this path the local longest holds a character offset (it is set to nchars + 1 below), so nchars - longest is a length */
205 if ((nchars
- longest
) > wp
->longest
)
206 wp
->longest
= nchars
- longest
;
/* c is the space remaining in side after the o leftover bytes */
210 if ((c
= sizeof(side
) - o
) > n
)
220 nchars
+= (cp
-side
) - 1;
/* translate the position reached inside side back into the newly reserved buffer */
223 cp
= buff
+ (cp
-side
) - o
;
/* report an invalid character at most once per line, and not at all under WC_QUIET */
230 if (x
== -1 && eline
!= nlines
&& !(wp
->mode
& WC_QUIET
))
231 eline
= invalid(file
, nlines
);
237 if ((nchars
- longest
) > wp
->longest
)
238 wp
->longest
= nchars
- longest
;
239 longest
= nchars
+ 1;
243 else if (iswspace(x
))
252 if (!(wp
->mode
& WC_MBYTE
))
/* && binds tighter than ||: either (wp->mb is 0 and WC_LONGEST is off) or (wp->mb > 0 and none of WC_MBYTE, WC_WORDS, WC_LONGEST is set) */
255 else if (!wp
->mb
&& !(wp
->mode
& WC_LONGEST
) || wp
->mb
> 0 && !(wp
->mode
& (WC_MBYTE
|WC_WORDS
|WC_LONGEST
)))
/* none of WC_MBYTE, WC_WORDS, WC_LONGEST requested */
257 if (!(wp
->mode
& (WC_MBYTE
|WC_WORDS
|WC_LONGEST
)))
259 while ((cp
= (unsigned char*)sfreserve(fd
, SF_UNBOUND
, 0)) && (c
= sfvalue(fd
)) > 0)
/* step endbuff back one byte and test whether that byte is a newline */
263 if (*--endbuff
== '\n')
278 while ((cp
= buff
= (unsigned char*)sfreserve(fd
, SF_UNBOUND
, 0)) && (c
= sfvalue(fd
)) > 0)
281 /* check to see whether first character terminates word */
286 if ((c
= type
[*cp
]) && !lasttype
)
291 if (!lasttype
&& type
[*cp
])
/* set endbuff to cp+c and store a newline there; appears to serve as a sentinel for the scanning loops below */
294 *(endbuff
= cp
+c
) = '\n';
296 /* process each buffer */
299 /* process spaces and new-lines */
305 /* check for end of buffer */
313 } while (c
= type
[*cp
++]);
314 /* skip over word characters */
315 while (!(c
= type
[*cp
++]));
/* back cp up by two and check that it still lies within the buffer */
319 if ((cp
-= 2) >= buff
)
/* carry the class of the last byte over to the next buffer */
323 lasttype
= type
[lastchar
];
324 /* see if was in word */
343 unsigned char* start
;
346 start
= (endbuff
= side
) + 1;
/* xspace records whether iswspace() accepts 0xa0 or 0x85 */
347 xspace
= iswspace(0xa0) || iswspace(0x85);
348 while ((cp
= buff
= (unsigned char*)sfreserve(fd
, SF_UNBOUND
, 0)) && (c
= sfvalue(fd
)) > 0)
353 /* check to see whether first character terminates word */
358 if((c
= type
[*cp
]) && !lasttype
)
372 if(!lasttype
&& spc(type
[*cp
]))
375 /* process each buffer */
378 /* process spaces and new-lines */
384 /* check for end of buffer */
387 if(wp
->mode
&WC_LONGEST
)
/* NOTE(review): the comparison uses (cp-start)-adjust while the assignment subtracts one more; confirm this is intended */
389 if((cp
-start
)-adjust
> longest
)
390 longest
= (cp
-start
)-adjust
-1;
397 } while (spc(c
= type
[*cp
++]));
/* state is taken from the low two bits of the previous byte when skip is 2 and bits 0xc of that byte are clear */
418 if(skip
==2 && (cp
[-1]&0xc)==0 && (state
=(cp
[-1]&0x3)))
420 else if(xspace
&& cp
[-1]==0xc2)
/* feed the byte to chkstate() while a state is active */
429 if(state
&& (state
=chkstate(state
,oldc
)))
442 } while (mbc(c
= type
[*cp
++]));
446 if(eol(c
) && (cp
> endbuff
))
/* report an invalid character at most once per line, and not at all under WC_QUIET */
451 if(eline
!=nlines
&& !(wp
->mode
& WC_QUIET
))
452 eline
= invalid(file
, nlines
);
/* NOTE(review): (c|WC_ERR) is nonzero whenever WC_ERR is a nonzero flag, which would make the (c&7)==0 alternative irrelevant; possibly c&WC_ERR was intended - confirm */
453 while(mbc(c
) && ((c
|WC_ERR
) || (c
&7)==0))
455 if(eol(c
) && (cp
> endbuff
))
471 /* skip over word characters */
472 while(!(c
= type
[*cp
++]));
/* back cp up by two and check that it still lies within the buffer */
479 if((cp
-= 2) >= buff
)
483 lasttype
= type
[lastchar
];
484 /* see if was in word */
/* final longest-line update: the length runs to endbuff + 1, less adjust, less one more when the last byte was a newline */
488 if ((wp
->mode
&WC_LONGEST
) && ((endbuff
+ 1 - start
) - adjust
- (lastchar
== '\n')) > longest
)
489 longest
= (endbuff
+ 1 - start
) - adjust
- (lastchar
== '\n');
490 wp
->longest
= longest
;
495 if (wp
->mode
& WC_MBYTE
)