// Simple functions to test UTF-8 characters.
// Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
// First version 2010-08
// Written for notepad++, and distributed under same license:
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either
// version 2 of the License, or (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 namespace Utf8
{ // could be a static class, instead of a namespace, if it needs private members
// basic classification of UTF-8 bytes
// True when c has its high bit clear (0x00-0x7F), i.e. the byte is a complete character by itself.
inline static bool isSingleByte(unsigned char c) {
	return (c & 0x80) == 0;
}
// True when c has its high bit set (0x80-0xFF): the byte can only appear inside a multi-byte sequence.
// No distinction is made here between first bytes, continuation bytes and invalid bytes.
inline static bool isPartOfMultibyte(unsigned char c) {
	return (c & 0x80) != 0;
}
// True when c is accepted as the first byte of a multi-byte character: 0xC2 to 0xF4 inclusive.
// 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode, so they are rejected.
inline static bool isFirstOfMultibyte(unsigned char c) {
	return 0xC2 <= c && c <= 0xF4;
}
// True when the two top bits of c are "10" (0x80-0xBF), the pattern of a continuation byte.
inline static bool isContinuation(unsigned char c) {
	return (c >> 6) == 0x2;
}
28 inline static bool isValid(unsigned char c
) { return c
< 0xC0 || isFirstOfMultibyte(c
); } // validates a byte, out of context
// Number of continuation bytes announced by a valid first byte (0 for single byte characters).
// Bytes below 0xC0 yield 0; otherwise bits 5-4 of c select the count:
// 00 or 01 -> 1, 10 -> 2, 11 -> 3. The result is only meaningful when c is a valid first byte.
inline static int continuationBytes(unsigned char c) {
	if (c < 0xC0)
		return 0;
	switch ((c >> 4) & 0x3) {
		case 2:  return 2;
		case 3:  return 3;
		default: return 1;
	}
}
36 // validates a full character
37 inline static bool isValid(const unsigned char* buf
, int buflen
) {
38 if(isSingleByte(buf
[0])) return true; // single byte is valid
39 if(!isFirstOfMultibyte(buf
[0])) return false; // not single byte, nor valid multi-byte first byte
40 int charContinuationBytes
= continuationBytes(buf
[0]);
41 if(buflen
< charContinuationBytes
+1) return false; // character does not fit in buffer
42 for(int i
= charContinuationBytes
; i
>0; --i
)
43 if(!isContinuation(*(++buf
))) return false; // not enough continuation bytes
44 return true; // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid)
47 // rewinds to the first byte of a multi-byte character for any valid UTF-8 (and will not rewind too much on any other input)
48 inline static int characterStart(const unsigned char* buf
, int startingIndex
) {
49 int charContinuationBytes
= 0;
50 while(charContinuationBytes
< startingIndex
// rewind past start of buffer?
51 && charContinuationBytes
< 5 // UTF-8 support up to 5 continuation bytes (but valid sequences currently do not have more than 3)
52 && isContinuation(buf
[startingIndex
-charContinuationBytes
])
54 ++charContinuationBytes
;
55 return startingIndex
-charContinuationBytes
;