git.saurik.com Git - wxWidgets.git/blobdiff - src/stc/scintilla/src/UniConversion.cxx
Initial copy of Scintilla 3.21 code
[wxWidgets.git] / src / stc / scintilla / src / UniConversion.cxx
index 7dbe9e23de87457dba6ddea3c113a38bd6126e5d..ffe67f75c4439d203885e3c7e542961c66c4db81 100644 (file)
@@ -1,6 +1,6 @@
 // Scintilla source code edit control
 /** @file UniConversion.cxx
 // Scintilla source code edit control
 /** @file UniConversion.cxx
- ** Functions to handle UFT-8 and UCS-2 strings.
+ ** Functions to handle UTF-8 and UTF-16 strings.
  **/
 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
 // The License.txt file describes the conditions under which this software may be distributed.
  **/
 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
 // The License.txt file describes the conditions under which this software may be distributed.
@@ -61,10 +61,22 @@ void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned
        putf[len] = '\0';
 }
 
        putf[len] = '\0';
 }
 
+unsigned int UTF8CharLength(unsigned char ch) { // Byte length of a UTF-8 sequence, judged from its first byte alone.
+       if (ch < 0x80) { // below 0x80: one byte
+               return 1;
+       } else if (ch < 0x80 + 0x40 + 0x20) { // below 0xE0: two bytes; values 0x80..0xBF also land here
+               return 2;
+       } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { // below 0xF0: three bytes
+               return 3;
+       } else { // 0xF0..0xFF: four bytes; no validity check is made on ch
+               return 4;
+       }
+}
+
 unsigned int UTF16Length(const char *s, unsigned int len) {
        unsigned int ulen = 0;
        unsigned int charLen;
 unsigned int UTF16Length(const char *s, unsigned int len) {
        unsigned int ulen = 0;
        unsigned int charLen;
-       for (unsigned int i=0;i<len;) {
+       for (unsigned int i=0; i<len;) {
                unsigned char ch = static_cast<unsigned char>(s[i]);
                if (ch < 0x80) {
                        charLen = 1;
                unsigned char ch = static_cast<unsigned char>(s[i]);
                if (ch < 0x80) {
                        charLen = 1;
@@ -117,3 +129,120 @@ unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsig
        }
        return ui;
 }
        }
        return ui;
 }
+
+int UTF8BytesOfLead[256]; // Indexed by byte value; UTF8BytesOfLeadInitialise() stores BytesFromLead(i) in slot i.
+static bool initialisedBytesOfLead = false; // Set to true by UTF8BytesOfLeadInitialise() after the table is filled.
+
+static int BytesFromLead(int leadByte) { // Sequence length implied by leadByte; 1 when it cannot start a multi-byte sequence.
+       if (leadByte < 0xC2) {
+               // Single byte or invalid: everything below 0xC2 is given a width of 1
+               return 1;
+       } else if (leadByte < 0xE0) { // 0xC2..0xDF
+               return 2;
+       } else if (leadByte < 0xF0) { // 0xE0..0xEF
+               return 3;
+       } else if (leadByte < 0xF5) { // 0xF0..0xF4
+               return 4;
+       } else {
+               // Characters longer than 4 bytes not possible in current UTF-8, so 0xF5 and above get 1
+               return 1;
+       }
+}
+
+void UTF8BytesOfLeadInitialise() { // Fills UTF8BytesOfLead on the first call; later calls skip the loop.
+       if (!initialisedBytesOfLead) {
+               for (int i=0;i<256;i++) { // one entry per possible byte value
+                       UTF8BytesOfLead[i] = BytesFromLead(i);
+               }
+               initialisedBytesOfLead = true; // NOTE(review): plain bool, no synchronisation in this function - confirm callers do not race
+       }
+}
+
+// Return the width of the first character in the string, with UTF8MaskInvalid
+// ORed in when the sequence is invalid. us[0] is read before len is consulted.
+// Most invalid sequences return a width of 1 so are treated as isolated bytes but
+// the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
+// reasonably treated as code points in some circumstances. They will, however,
+// not have associated glyphs.
+int UTF8Classify(const unsigned char *us, int len) {
+       // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
+       if (*us < 0x80) { // NOTE(review): dereferenced with no len check - presumably callers pass len >= 1; verify
+               // Single bytes easy
+               return 1;
+       } else if (*us > 0xf4) {
+               // Characters longer than 4 bytes not possible in current UTF-8
+               return UTF8MaskInvalid | 1;
+       } else if (*us >= 0xf0) {
+               // 4 bytes
+               if (len < 4) // fewer than 4 bytes supplied, so the lead byte is reported alone
+                       return UTF8MaskInvalid | 1;
+               if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
+                       if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
+                               // *FFFE or *FFFF non-character. NOTE(review): tested before the range and overlong checks below, so (assuming UTF8IsTrailByte accepts 0x80..0xBF) F4 9F BF BF and F0 8F BF BF also return width 4 here - confirm intended
+                               return UTF8MaskInvalid | 4;
+                       }
+                       if (*us == 0xf4) {
+                               // Check if encoding a value beyond the last Unicode character 10FFFF
+                               if (us[1] > 0x8f) {
+                                       return UTF8MaskInvalid | 1;
+                               } else if (us[1] == 0x8f) {
+                                       if (us[2] > 0xbf) { // NOTE(review): presumably never true once UTF8IsTrailByte(us[2]) has passed - verify
+                                               return UTF8MaskInvalid | 1;
+                                       } else if (us[2] == 0xbf) {
+                                               if (us[3] > 0xbf) { // NOTE(review): same remark as for us[2] above
+                                                       return UTF8MaskInvalid | 1;
+                                               }
+                                       }
+                               }
+                       } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
+                               // Overlong: lead 0xF0 followed by 0x80..0x8F
+                               return UTF8MaskInvalid | 1;
+                       }
+                       return 4; // valid 4-byte sequence, no invalid flag
+               } else {
+                       return UTF8MaskInvalid | 1; // one of the three following bytes is not a trail byte
+               }
+       } else if (*us >= 0xe0) {
+               // 3 bytes
+               if (len < 3) // fewer than 3 bytes supplied
+                       return UTF8MaskInvalid | 1;
+               if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
+                       if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
+                               // Overlong: lead 0xE0 followed by 0x80..0x9F
+                               return UTF8MaskInvalid | 1;
+                       }
+                       if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
+                               // Surrogate: lead 0xED followed by 0xA0..0xBF
+                               return UTF8MaskInvalid | 1;
+                       }
+                       if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
+                               // U+FFFE non-character - 3 bytes long
+                               return UTF8MaskInvalid | 3;
+                       }
+                       if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
+                               // U+FFFF non-character - 3 bytes long
+                               return UTF8MaskInvalid | 3;
+                       }
+                       if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
+                               // U+FDD0 .. U+FDEF
+                               return UTF8MaskInvalid | 3;
+                       }
+                       return 3; // valid 3-byte sequence, no invalid flag
+               } else {
+                       return UTF8MaskInvalid | 1; // one of the two following bytes is not a trail byte
+               }
+       } else if (*us >= 0xc2) {
+               // 2 bytes
+               if (len < 2) // fewer than 2 bytes supplied
+                       return UTF8MaskInvalid | 1;
+               if (UTF8IsTrailByte(us[1])) {
+                       return 2; // valid 2-byte sequence, no invalid flag
+               } else {
+                       return UTF8MaskInvalid | 1;
+               }
+       } else {
+               // 0xc0 .. 0xc1 is overlong encoding
+               // 0x80 .. 0xbf is trail byte
+               return UTF8MaskInvalid | 1;
+       }
+}