src/stc/scintilla/src/UniConversion.cxx

   1 // Scintilla source code edit control
   2 /** @file UniConversion.cxx
   3  ** Functions to handle UTF-8 and UTF-16 strings.
   4  **/
   5 // Copyright 1998-2001 by Neil Hodgson <neilh@scintilla.org>
   6 // The License.txt file describes the conditions under which this software may be distributed.
   7
   8 #include <stdlib.h>
   9
  10 #include "UniConversion.h"
  11
  12 enum { SURROGATE_LEAD_FIRST = 0xD800 };
  13 enum { SURROGATE_TRAIL_FIRST = 0xDC00 };
  14 enum { SURROGATE_TRAIL_LAST = 0xDFFF };
  15
  16 unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) {
  17         unsigned int len = 0;
  18         for (unsigned int i = 0; i < tlen && uptr[i];) {
  19                 unsigned int uch = uptr[i];
  20                 if (uch < 0x80) {
  21                         len++;
  22                 } else if (uch < 0x800) {
  23                         len += 2;
  24                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  25                         (uch <= SURROGATE_TRAIL_LAST)) {
  26                         len += 4;
  27                         i++;
  28                 } else {
  29                         len += 3;
  30                 }
  31                 i++;
  32         }
  33         return len;
  34 }
  35
  36 void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) {
  37         int k = 0;
  38         for (unsigned int i = 0; i < tlen && uptr[i];) {
  39                 unsigned int uch = uptr[i];
  40                 if (uch < 0x80) {
  41                         putf[k++] = static_cast<char>(uch);
  42                 } else if (uch < 0x800) {
  43                         putf[k++] = static_cast<char>(0xC0 | (uch >> 6));
  44                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  45                 } else if ((uch >= SURROGATE_LEAD_FIRST) &&
  46                         (uch <= SURROGATE_TRAIL_LAST)) {
  47                         // Half a surrogate pair
  48                         i++;
  49                         unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff);
  50                         putf[k++] = static_cast<char>(0xF0 | (xch >> 18));
  51                         putf[k++] = static_cast<char>(0x80 | ((xch >> 12) & 0x3f));
  52                         putf[k++] = static_cast<char>(0x80 | ((xch >> 6) & 0x3f));
  53                         putf[k++] = static_cast<char>(0x80 | (xch & 0x3f));
  54                 } else {
  55                         putf[k++] = static_cast<char>(0xE0 | (uch >> 12));
  56                         putf[k++] = static_cast<char>(0x80 | ((uch >> 6) & 0x3f));
  57                         putf[k++] = static_cast<char>(0x80 | (uch & 0x3f));
  58                 }
  59                 i++;
  60         }
  61         putf[len] = '\0';
  62 }
  63
  64 unsigned int UTF8CharLength(unsigned char ch) {
  65         if (ch < 0x80) {
  66                 return 1;
  67         } else if (ch < 0x80 + 0x40 + 0x20) {
  68                 return 2;
  69         } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  70                 return 3;
  71         } else {
  72                 return 4;
  73         }
  74 }
  75
  76 unsigned int UTF16Length(const char *s, unsigned int len) {
  77         unsigned int ulen = 0;
  78         unsigned int charLen;
  79         for (unsigned int i=0; i<len;) {
  80                 unsigned char ch = static_cast<unsigned char>(s[i]);
  81                 if (ch < 0x80) {
  82                         charLen = 1;
  83                 } else if (ch < 0x80 + 0x40 + 0x20) {
  84                         charLen = 2;
  85                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
  86                         charLen = 3;
  87                 } else {
  88                         charLen = 4;
  89                         ulen++;
  90                 }
  91                 i += charLen;
  92                 ulen++;
  93         }
  94         return ulen;
  95 }
  96
  97 unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) {
  98         unsigned int ui=0;
  99         const unsigned char *us = reinterpret_cast<const unsigned char *>(s);
 100         unsigned int i=0;
 101         while ((i<len) && (ui<tlen)) {
 102                 unsigned char ch = us[i++];
 103                 if (ch < 0x80) {
 104                         tbuf[ui] = ch;
 105                 } else if (ch < 0x80 + 0x40 + 0x20) {
 106                         tbuf[ui] = static_cast<wchar_t>((ch & 0x1F) << 6);
 107                         ch = us[i++];
 108                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 109                 } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) {
 110                         tbuf[ui] = static_cast<wchar_t>((ch & 0xF) << 12);
 111                         ch = us[i++];
 112                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + ((ch & 0x7F) << 6));
 113                         ch = us[i++];
 114                         tbuf[ui] = static_cast<wchar_t>(tbuf[ui] + (ch & 0x7F));
 115                 } else {
 116                         // Outside the BMP so need two surrogates
 117                         int val = (ch & 0x7) << 18;
 118                         ch = us[i++];
 119                         val += (ch & 0x3F) << 12;
 120                         ch = us[i++];
 121                         val += (ch & 0x3F) << 6;
 122                         ch = us[i++];
 123                         val += (ch & 0x3F);
 124                         tbuf[ui] = static_cast<wchar_t>(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST);
 125                         ui++;
 126                         tbuf[ui] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
 127                 }
 128                 ui++;
 129         }
 130         return ui;
 131 }
 132
 133 int UTF8BytesOfLead[256];
 134 static bool initialisedBytesOfLead = false;
 135
 136 static int BytesFromLead(int leadByte) {
 137         if (leadByte < 0xC2) {
 138                 // Single byte or invalid
 139                 return 1;
 140         } else if (leadByte < 0xE0) {
 141                 return 2;
 142         } else if (leadByte < 0xF0) {
 143                 return 3;
 144         } else if (leadByte < 0xF5) {
 145                 return 4;
 146         } else {
 147                 // Characters longer than 4 bytes not possible in current UTF-8
 148                 return 1;
 149         }
 150 }
 151
 152 void UTF8BytesOfLeadInitialise() {
 153         if (!initialisedBytesOfLead) {
 154                 for (int i=0;i<256;i++) {
 155                         UTF8BytesOfLead[i] = BytesFromLead(i);
 156                 }
 157                 initialisedBytesOfLead = true;
 158         }
 159 }
 160
 161 // Return both the width of the first character in the string and a status
 162 // saying whether it is valid or invalid.
 163 // Most invalid sequences return a width of 1 so are treated as isolated bytes but
 164 // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be
 165 // reasonably treated as code points in some circumstances. They will, however,
 166 // not have associated glyphs.
 167 int UTF8Classify(const unsigned char *us, int len) {
 168         // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
 169         if (*us < 0x80) {
 170                 // Single bytes easy
 171                 return 1;
 172         } else if (*us > 0xf4) {
 173                 // Characters longer than 4 bytes not possible in current UTF-8
 174                 return UTF8MaskInvalid | 1;
 175         } else if (*us >= 0xf0) {
 176                 // 4 bytes
 177                 if (len < 4)
 178                         return UTF8MaskInvalid | 1;
 179                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) {
 180                         if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) {
 181                                 // *FFFE or *FFFF non-character
 182                                 return UTF8MaskInvalid | 4;
 183                         }
 184                         if (*us == 0xf4) {
 185                                 // Check if encoding a value beyond the last Unicode character 10FFFF
 186                                 if (us[1] > 0x8f) {
 187                                         return UTF8MaskInvalid | 1;
 188                                 } else if (us[1] == 0x8f) {
 189                                         if (us[2] > 0xbf) {
 190                                                 return UTF8MaskInvalid | 1;
 191                                         } else if (us[2] == 0xbf) {
 192                                                 if (us[3] > 0xbf) {
 193                                                         return UTF8MaskInvalid | 1;
 194                                                 }
 195                                         }
 196                                 }
 197                         } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) {
 198                                 // Overlong
 199                                 return UTF8MaskInvalid | 1;
 200                         }
 201                         return 4;
 202                 } else {
 203                         return UTF8MaskInvalid | 1;
 204                 }
 205         } else if (*us >= 0xe0) {
 206                 // 3 bytes
 207                 if (len < 3)
 208                         return UTF8MaskInvalid | 1;
 209                 if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) {
 210                         if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) {
 211                                 // Overlong
 212                                 return UTF8MaskInvalid | 1;
 213                         }
 214                         if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) {
 215                                 // Surrogate
 216                                 return UTF8MaskInvalid | 1;
 217                         }
 218                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) {
 219                                 // U+FFFE non-character - 3 bytes long
 220                                 return UTF8MaskInvalid | 3;
 221                         }
 222                         if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) {
 223                                 // U+FFFF non-character - 3 bytes long
 224                                 return UTF8MaskInvalid | 3;
 225                         }
 226                         if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) {
 227                                 // U+FDD0 .. U+FDEF
 228                                 return UTF8MaskInvalid | 3;
 229                         }
 230                         return 3;
 231                 } else {
 232                         return UTF8MaskInvalid | 1;
 233                 }
 234         } else if (*us >= 0xc2) {
 235                 // 2 bytes
 236                 if (len < 2)
 237                         return UTF8MaskInvalid | 1;
 238                 if (UTF8IsTrailByte(us[1])) {
 239                         return 2;
 240                 } else {
 241                         return UTF8MaskInvalid | 1;
 242                 }
 243         } else {
 244                 // 0xc0 .. 0xc1 is overlong encoding
 245                 // 0x80 .. 0xbf is trail byte
 246                 return UTF8MaskInvalid | 1;
 247         }
 248 }