wtf/unicode/glib/UnicodeGLib.cpp

   1 /*
   2  *  Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
   3  *  Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
   4  *  Copyright (C) 2010 Igalia S.L.
   5  *
   6  *  This library is free software; you can redistribute it and/or
   7  *  modify it under the terms of the GNU Library General Public
   8  *  License as published by the Free Software Foundation; either
   9  *  version 2 of the License, or (at your option) any later version.
  10  *
  11  *  This library is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  *  Library General Public License for more details.
  15  *
  16  *  You should have received a copy of the GNU Library General Public License
  17  *  along with this library; see the file COPYING.LIB.  If not, write to
  18  *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  19  *  Boston, MA 02110-1301, USA.
  20  *
  21  */
  22
  23 #include "config.h"
  24 #include "UnicodeGLib.h"
  25
  26 #include <wtf/Vector.h>
  27 #include <wtf/unicode/UTF8.h>
  28
  29 #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
  30
  31 namespace WTF {
  32 namespace Unicode {
  33
  34 UChar32 foldCase(UChar32 ch)
  35 {
  36     GOwnPtr<GError> gerror;
  37
  38     GOwnPtr<char> utf8char;
  39     utf8char.set(g_ucs4_to_utf8(reinterpret_cast<gunichar*>(&ch), 1, 0, 0, &gerror.outPtr()));
  40     if (gerror)
  41         return ch;
  42
  43     GOwnPtr<char> utf8caseFolded;
  44     utf8caseFolded.set(g_utf8_casefold(utf8char.get(), -1));
  45
  46     GOwnPtr<gunichar> ucs4Result;
  47     ucs4Result.set(g_utf8_to_ucs4_fast(utf8caseFolded.get(), -1, 0));
  48
  49     return *ucs4Result;
  50 }
  51
  52 static int getUTF16LengthFromUTF8(const gchar* utf8String, int length)
  53 {
  54     int utf16Length = 0;
  55     const gchar* inputString = utf8String;
  56
  57     while ((utf8String + length - inputString > 0) && *inputString) {
  58         gunichar character = g_utf8_get_char(inputString);
  59
  60         utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1;
  61         inputString = g_utf8_next_char(inputString);
  62     }
  63
  64     return utf16Length;
  65 }
  66
  67 typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length);
  68
  69 static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction)
  70 {
  71     *error = false;
  72
  73     // Allocate a buffer big enough to hold all the characters.
  74     Vector<char> buffer(srcLength * 3);
  75     char* utf8Target = buffer.data();
  76     const UChar* utf16Source = src;
  77     ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true);
  78     if (conversionResult != conversionOK) {
  79         *error = true;
  80         return -1;
  81     }
  82     buffer.shrink(utf8Target - buffer.data());
  83
  84     GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size()));
  85     long utf8ResultLength = strlen(utf8Result.get());
  86
  87     // Calculate the destination buffer size.
  88     int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength);
  89     if (realLength > resultLength) {
  90         *error = true;
  91         return realLength;
  92     }
  93
  94     // Convert the result to UTF-16.
  95     UChar* utf16Target = result;
  96     const char* utf8Source = utf8Result.get();
  97     conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true);
  98     long utf16ResultLength = utf16Target - result;
  99     if (conversionResult != conversionOK)
 100         *error = true;
 101
 102     return utf16ResultLength <= 0 ? -1 : utf16ResultLength;
 103 }
 104 int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
 105 {
 106     return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold);
 107 }
 108
 109 int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
 110 {
 111     return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown);
 112 }
 113
 114 int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
 115 {
 116     return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup);
 117 }
 118
 119 Direction direction(UChar32 c)
 120 {
 121     PangoBidiType type = pango_bidi_type_for_unichar(c);
 122     switch (type) {
 123     case PANGO_BIDI_TYPE_L:
 124         return LeftToRight;
 125     case PANGO_BIDI_TYPE_R:
 126         return RightToLeft;
 127     case PANGO_BIDI_TYPE_AL:
 128         return RightToLeftArabic;
 129     case PANGO_BIDI_TYPE_LRE:
 130         return LeftToRightEmbedding;
 131     case PANGO_BIDI_TYPE_RLE:
 132         return RightToLeftEmbedding;
 133     case PANGO_BIDI_TYPE_LRO:
 134         return LeftToRightOverride;
 135     case PANGO_BIDI_TYPE_RLO:
 136         return RightToLeftOverride;
 137     case PANGO_BIDI_TYPE_PDF:
 138         return PopDirectionalFormat;
 139     case PANGO_BIDI_TYPE_EN:
 140         return EuropeanNumber;
 141     case PANGO_BIDI_TYPE_AN:
 142         return ArabicNumber;
 143     case PANGO_BIDI_TYPE_ES:
 144         return EuropeanNumberSeparator;
 145     case PANGO_BIDI_TYPE_ET:
 146         return EuropeanNumberTerminator;
 147     case PANGO_BIDI_TYPE_CS:
 148         return CommonNumberSeparator;
 149     case PANGO_BIDI_TYPE_NSM:
 150         return NonSpacingMark;
 151     case PANGO_BIDI_TYPE_BN:
 152         return BoundaryNeutral;
 153     case PANGO_BIDI_TYPE_B:
 154         return BlockSeparator;
 155     case PANGO_BIDI_TYPE_S:
 156         return SegmentSeparator;
 157     case PANGO_BIDI_TYPE_WS:
 158         return WhiteSpaceNeutral;
 159     default:
 160         return OtherNeutral;
 161     }
 162 }
 163
 164 int umemcasecmp(const UChar* a, const UChar* b, int len)
 165 {
 166     GOwnPtr<char> utf8a;
 167     GOwnPtr<char> utf8b;
 168
 169     utf8a.set(g_utf16_to_utf8(a, len, 0, 0, 0));
 170     utf8b.set(g_utf16_to_utf8(b, len, 0, 0, 0));
 171
 172     GOwnPtr<char> foldedA;
 173     GOwnPtr<char> foldedB;
 174
 175     foldedA.set(g_utf8_casefold(utf8a.get(), -1));
 176     foldedB.set(g_utf8_casefold(utf8b.get(), -1));
 177
 178     // FIXME: umemcasecmp needs to mimic u_memcasecmp of icu
 179     // from the ICU docs:
 180     // "Compare two strings case-insensitively using full case folding.
 181     // his is equivalent to u_strcmp(u_strFoldCase(s1, n, options), u_strFoldCase(s2, n, options))."
 182     //
 183     // So it looks like we don't need the full g_utf8_collate here,
 184     // but really a bitwise comparison of casefolded unicode chars (not utf-8 bytes).
 185     // As there is no direct equivalent to this icu function in GLib, for now
 186     // we'll use g_utf8_collate():
 187
 188     return g_utf8_collate(foldedA.get(), foldedB.get());
 189 }
 190
 191 }
 192 }