include/Utf8.h

   1 // Simple functions to test UTF-8 characters.
   2 // Copyright (C)2010 Francois-R.Boyer@PolyMtl.ca
   3 // First version 2010-08
   4 //
   5 // Written for notepad++, and distributed under same license:
   6 // This program is free software; you can redistribute it and/or
   7 // modify it under the terms of the GNU General Public License
   8 // as published by the Free Software Foundation; either
   9 // version 2 of the License, or (at your option) any later version.
  10
  11 // This program is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // You should have received a copy of the GNU General Public License
  17 // along with this program; if not, write to the Free Software
  18 // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19
  20 #pragma once
  21
  22 namespace Utf8 { // could be a static class, instead of a namespace, if it needs private members
  23         // basic classification of UTF-8 bytes
  24         inline static bool isSingleByte(unsigned char c)       { return c < 0x80; }
  25         inline static bool isPartOfMultibyte(unsigned char c)  { return c >= 0x80; }
  26         inline static bool isFirstOfMultibyte(unsigned char c) { return c >= 0xC2 && c < 0xF5; } // 0xF5 to 0xFD are defined by UTF-8, but are not currently valid Unicode
  27         inline static bool isContinuation(unsigned char c)     { return (c & 0xC0) == 0x80; }
  28         inline static bool isValid(unsigned char c)            { return c < 0xC0 || isFirstOfMultibyte(c); }    // validates a byte, out of context
  29
  30         // number of continuation bytes for a given valid first character (0 for single byte characters)
  31         inline static int  continuationBytes(unsigned char c)  {
  32                 static const char _len[] = { 1,1,2,3 };
  33                 return (c < 0xC0) ? 0 : _len[(c & 0x30) >>  4];
  34         }
  35
  36         // validates a full character
  37         inline static bool isValid(const unsigned char* buf, int buflen) {
  38                 if(isSingleByte(buf[0])) return true; // single byte is valid
  39                 if(!isFirstOfMultibyte(buf[0])) return false; // not single byte, nor valid multi-byte first byte
  40                 int charContinuationBytes = continuationBytes(buf[0]);
  41                 if(buflen < charContinuationBytes+1) return false; // character does not fit in buffer
  42                 for(int i = charContinuationBytes; i>0; --i)
  43                         if(!isContinuation(*(++buf))) return false; // not enough continuation bytes
  44                 return true;  // the character is valid (if there are too many continuation bytes, it is the next character that will be invalid)
  45         }
  46
  47         // rewinds to the first byte of a multi-byte character for any valid UTF-8 (and will not rewind too much on any other input)
  48         inline static int characterStart(const unsigned char* buf, int startingIndex) {
  49                 int charContinuationBytes = 0;
  50                 while(charContinuationBytes < startingIndex     // rewind past start of buffer?
  51                         && charContinuationBytes < 5    // UTF-8 support up to 5 continuation bytes (but valid sequences currently do not have more than 3)
  52                         && isContinuation(buf[startingIndex-charContinuationBytes])
  53                         )
  54                         ++charContinuationBytes;
  55                 return startingIndex-charContinuationBytes;
  56         }
  57 };