[apple/javascriptcore.git] / wtf / unicode / glib / UnicodeGLib.cpp

/*
 *  Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
 *  Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
 *  Copyright (C) 2010 Igalia S.L.
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Library General Public
 *  License as published by the Free Software Foundation; either
 *  version 2 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Library General Public License for more details.
 *
 *  You should have received a copy of the GNU Library General Public License
 *  along with this library; see the file COPYING.LIB.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 *  Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"
#include "UnicodeGLib.h"

#include <wtf/Vector.h>
#include <wtf/unicode/UTF8.h>

// True when |character| lies in U+10000..U+10FFFF, i.e. it is a supplementary
// code point that is counted as two UTF-16 code units (a surrogate pair) by
// the length calculation in this file.
// The argument is parenthesized so that expression arguments expand with the
// intended precedence; note that it is still evaluated twice.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)

namespace WTF {
namespace Unicode {

// Case-folds a single code point by round-tripping it through GLib's UTF-8
// helpers: encode |ch| as UTF-8, fold the resulting string, decode it back to
// UCS-4 and return the first code point of the decoded string.
// If the initial UCS-4 -> UTF-8 conversion reports an error, |ch| is returned
// unchanged.
UChar32 foldCase(UChar32 ch)
{
    GOwnPtr<GError> conversionError;
    GOwnPtr<char> encoded(g_ucs4_to_utf8(reinterpret_cast<gunichar*>(&ch), 1, 0, 0, &conversionError.outPtr()));
    if (conversionError)
        return ch;

    GOwnPtr<char> folded(g_utf8_casefold(encoded.get(), -1));
    GOwnPtr<gunichar> decoded(g_utf8_to_ucs4_fast(folded.get(), -1, 0));

    // Only the first decoded code point is reported to the caller.
    return decoded.get()[0];
}

static int getUTF16LengthFromUTF8(const gchar* utf8String, int length)
{
    int utf16Length = 0;
    const gchar* inputString = utf8String;

    while ((utf8String + length - inputString > 0) && *inputString) {
        gunichar character = g_utf8_get_char(inputString);

        utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1;
        inputString = g_utf8_next_char(inputString);
    }

    return utf16Length;
}

// Pointer to a case-mapping routine that takes a UTF-8 string plus its length
// and returns a gchar* result. In this file it is instantiated with
// g_utf8_casefold, g_utf8_strdown and g_utf8_strup, and convertCase() takes
// ownership of the returned string.
typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length);

// Shared implementation behind the string variants of foldCase(), toLower()
// and toUpper().
//
// The UTF-16 input |src| (|srcLength| code units) is converted to UTF-8,
// passed through |caseFunction|, and the outcome is converted back to UTF-16
// into |result|, which has room for |resultLength| code units.
//
// |error| is reset to false on entry and set to true when:
//   - the input cannot be converted to UTF-8 (returns -1),
//   - |caseFunction| yields no string (returns -1),
//   - |result| is too small (returns the number of code units required), or
//   - the conversion back to UTF-16 does not finish cleanly.
// Otherwise the return value is the number of UTF-16 code units written, or
// -1 if nothing was written.
static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction)
{
    *error = false;

    // Allocate a buffer big enough to hold all the characters.
    Vector<char> buffer(srcLength * 3);
    char* utf8Target = buffer.data();
    const UChar* utf16Source = src;
    ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true);
    if (conversionResult != conversionOK) {
        *error = true;
        return -1;
    }
    // Trim the buffer down to the bytes that were actually produced.
    buffer.shrink(utf8Target - buffer.data());

    GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size()));
    // The case function may hand back NULL (GLib's implementations do so when
    // their argument checks fail); report that instead of passing NULL on to
    // strlen().
    if (!utf8Result.get()) {
        *error = true;
        return -1;
    }
    long utf8ResultLength = strlen(utf8Result.get());

    // Calculate the destination buffer size.
    int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength);
    if (realLength > resultLength) {
        // Tell the caller how much room is needed so it can retry.
        *error = true;
        return realLength;
    }

    // Convert the result to UTF-16.
    UChar* utf16Target = result;
    const char* utf8Source = utf8Result.get();
    conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true);
    long utf16ResultLength = utf16Target - result;
    if (conversionResult != conversionOK)
        *error = true;

    return utf16ResultLength <= 0 ? -1 : utf16ResultLength;
}
// Case-folds the UTF-16 string |src| into |result| by delegating to
// convertCase() with g_utf8_casefold; the return value and |error| come
// straight from convertCase().
int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
{
    const int foldedLength = convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold);
    return foldedLength;
}

// Lower-cases the UTF-16 string |src| into |result| by delegating to
// convertCase() with g_utf8_strdown; the return value and |error| come
// straight from convertCase().
int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
{
    const int loweredLength = convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown);
    return loweredLength;
}

// Upper-cases the UTF-16 string |src| into |result| by delegating to
// convertCase() with g_utf8_strup; the return value and |error| come
// straight from convertCase().
int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
{
    const int upperedLength = convertCase(result, resultLength, src, srcLength, error, g_utf8_strup);
    return upperedLength;
}

// Translates the Pango bidi type of code point |c| into the corresponding
// Direction value. Any Pango type without an explicit case below is reported
// as OtherNeutral.
Direction direction(UChar32 c)
{
    Direction mapped = OtherNeutral;

    switch (pango_bidi_type_for_unichar(c)) {
    // Strong types.
    case PANGO_BIDI_TYPE_L:
        mapped = LeftToRight;
        break;
    case PANGO_BIDI_TYPE_R:
        mapped = RightToLeft;
        break;
    case PANGO_BIDI_TYPE_AL:
        mapped = RightToLeftArabic;
        break;
    // Explicit embedding / override controls.
    case PANGO_BIDI_TYPE_LRE:
        mapped = LeftToRightEmbedding;
        break;
    case PANGO_BIDI_TYPE_RLE:
        mapped = RightToLeftEmbedding;
        break;
    case PANGO_BIDI_TYPE_LRO:
        mapped = LeftToRightOverride;
        break;
    case PANGO_BIDI_TYPE_RLO:
        mapped = RightToLeftOverride;
        break;
    case PANGO_BIDI_TYPE_PDF:
        mapped = PopDirectionalFormat;
        break;
    // Number-related types.
    case PANGO_BIDI_TYPE_EN:
        mapped = EuropeanNumber;
        break;
    case PANGO_BIDI_TYPE_AN:
        mapped = ArabicNumber;
        break;
    case PANGO_BIDI_TYPE_ES:
        mapped = EuropeanNumberSeparator;
        break;
    case PANGO_BIDI_TYPE_ET:
        mapped = EuropeanNumberTerminator;
        break;
    case PANGO_BIDI_TYPE_CS:
        mapped = CommonNumberSeparator;
        break;
    // Marks, separators and whitespace.
    case PANGO_BIDI_TYPE_NSM:
        mapped = NonSpacingMark;
        break;
    case PANGO_BIDI_TYPE_BN:
        mapped = BoundaryNeutral;
        break;
    case PANGO_BIDI_TYPE_B:
        mapped = BlockSeparator;
        break;
    case PANGO_BIDI_TYPE_S:
        mapped = SegmentSeparator;
        break;
    case PANGO_BIDI_TYPE_WS:
        mapped = WhiteSpaceNeutral;
        break;
    default:
        break;
    }

    return mapped;
}

// Case-insensitive comparison of the first |len| UTF-16 code units of |a| and
// |b|. Both inputs are converted to UTF-8, case-folded with GLib, and the
// folded strings are compared with g_utf8_collate(), whose result is returned
// (zero for equal, otherwise negative or positive).
//
// If either input cannot be converted to UTF-8, the strings are compared code
// unit by code unit without case folding instead, so ill-formed input is not
// silently reported as equal.
int umemcasecmp(const UChar* a, const UChar* b, int len)
{
    GOwnPtr<char> utf8a;
    GOwnPtr<char> utf8b;

    utf8a.set(g_utf16_to_utf8(a, len, 0, 0, 0));
    utf8b.set(g_utf16_to_utf8(b, len, 0, 0, 0));

    // g_utf16_to_utf8() returns NULL when the input is not valid UTF-16.
    // Passing NULL on would make g_utf8_collate() fail its argument check and
    // return 0 ("equal") regardless of the other operand, so fall back to a
    // plain code unit comparison. Like the GLib conversion, it stops at the
    // first NUL and treats a negative |len| as "NUL-terminated".
    if (!utf8a.get() || !utf8b.get()) {
        for (int i = 0; len < 0 || i < len; ++i) {
            if (a[i] != b[i])
                return a[i] < b[i] ? -1 : 1;
            if (!a[i])
                break;
        }
        return 0;
    }

    GOwnPtr<char> foldedA;
    GOwnPtr<char> foldedB;

    foldedA.set(g_utf8_casefold(utf8a.get(), -1));
    foldedB.set(g_utf8_casefold(utf8b.get(), -1));

    // FIXME: umemcasecmp needs to mimic u_memcasecmp of icu
    // from the ICU docs:
    // "Compare two strings case-insensitively using full case folding.
    // This is equivalent to u_strcmp(u_strFoldCase(s1, n, options), u_strFoldCase(s2, n, options))."
    //
    // So it looks like we don't need the full g_utf8_collate here,
    // but really a bitwise comparison of casefolded unicode chars (not utf-8 bytes).
    // As there is no direct equivalent to this icu function in GLib, for now
    // we'll use g_utf8_collate():

    return g_utf8_collate(foldedA.get(), foldedB.get());
}

}
}
Commit	Line	Data
ba379fdc A	1	/*
	2	* Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
	3	* Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
14957cd0	4	* Copyright (C) 2010 Igalia S.L.
ba379fdc A	5	*
	6	* This library is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU Library General Public
	8	* License as published by the Free Software Foundation; either
	9	* version 2 of the License, or (at your option) any later version.
	10	*
	11	* This library is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	* Library General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU Library General Public License
	17	* along with this library; see the file COPYING.LIB. If not, write to
	18	* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
	19	* Boston, MA 02110-1301, USA.
	20	*
	21	*/
	22
f9bf01c6	23	#include "config.h"
ba379fdc A	24	#include "UnicodeGLib.h"
ba379fdc A	25
14957cd0 A	26	#include <wtf/Vector.h>
	27	#include <wtf/unicode/UTF8.h>
	28
	29	#define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
	30
ba379fdc A	31	namespace WTF {
	32	namespace Unicode {
	33
	34	UChar32 foldCase(UChar32 ch)
	35	{
	36	GOwnPtr<GError> gerror;
	37
	38	GOwnPtr<char> utf8char;
	39	utf8char.set(g_ucs4_to_utf8(reinterpret_cast<gunichar*>(&ch), 1, 0, 0, &gerror.outPtr()));
	40	if (gerror)
	41	return ch;
	42
	43	GOwnPtr<char> utf8caseFolded;
	44	utf8caseFolded.set(g_utf8_casefold(utf8char.get(), -1));
	45
	46	GOwnPtr<gunichar> ucs4Result;
	47	ucs4Result.set(g_utf8_to_ucs4_fast(utf8caseFolded.get(), -1, 0));
	48
	49	return *ucs4Result;
	50	}
	51
14957cd0	52	static int getUTF16LengthFromUTF8(const gchar* utf8String, int length)
ba379fdc	53	{
14957cd0 A	54	int utf16Length = 0;
14957cd0 A	55	const gchar* inputString = utf8String;
ba379fdc	56
14957cd0 A	57	while ((utf8String + length - inputString > 0) && *inputString) {
14957cd0 A	58	gunichar character = g_utf8_get_char(inputString);
ba379fdc	59
14957cd0 A	60	utf16Length += UTF8_IS_SURROGATE(character) ? 2 : 1;
14957cd0 A	61	inputString = g_utf8_next_char(inputString);
ba379fdc A	62	}
ba379fdc A	63
14957cd0	64	return utf16Length;
ba379fdc A	65	}
ba379fdc A	66
14957cd0 A	67	typedef gchar* (*UTF8CaseFunction)(const gchar*, gssize length);
	68
	69	static int convertCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error, UTF8CaseFunction caseFunction)
ba379fdc A	70	{
ba379fdc A	71	*error = false;
ba379fdc	72
14957cd0 A	73	// Allocate a buffer big enough to hold all the characters.
	74	Vector<char> buffer(srcLength * 3);
	75	char* utf8Target = buffer.data();
	76	const UChar* utf16Source = src;
	77	ConversionResult conversionResult = convertUTF16ToUTF8(&utf16Source, utf16Source + srcLength, &utf8Target, utf8Target + buffer.size(), true);
	78	if (conversionResult != conversionOK) {
ba379fdc A	79	*error = true;
	80	return -1;
	81	}
14957cd0	82	buffer.shrink(utf8Target - buffer.data());
ba379fdc	83
14957cd0 A	84	GOwnPtr<char> utf8Result(caseFunction(buffer.data(), buffer.size()));
14957cd0 A	85	long utf8ResultLength = strlen(utf8Result.get());
ba379fdc	86
14957cd0 A	87	// Calculate the destination buffer size.
	88	int realLength = getUTF16LengthFromUTF8(utf8Result.get(), utf8ResultLength);
	89	if (realLength > resultLength) {
ba379fdc	90	*error = true;
14957cd0	91	return realLength;
ba379fdc A	92	}
ba379fdc A	93
14957cd0 A	94	// Convert the result to UTF-16.
	95	UChar* utf16Target = result;
	96	const char* utf8Source = utf8Result.get();
	97	conversionResult = convertUTF8ToUTF16(&utf8Source, utf8Source + utf8ResultLength, &utf16Target, utf16Target + resultLength, true);
	98	long utf16ResultLength = utf16Target - result;
	99	if (conversionResult != conversionOK)
ba379fdc	100	*error = true;
ba379fdc	101
14957cd0	102	return utf16ResultLength <= 0 ? -1 : utf16ResultLength;
ba379fdc	103	}
14957cd0	104	int foldCase(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
ba379fdc	105	{
14957cd0 A	106	return convertCase(result, resultLength, src, srcLength, error, g_utf8_casefold);
14957cd0 A	107	}
ba379fdc	108
14957cd0 A	109	int toLower(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
	110	{
	111	return convertCase(result, resultLength, src, srcLength, error, g_utf8_strdown);
	112	}
ba379fdc	113
14957cd0 A	114	int toUpper(UChar* result, int resultLength, const UChar* src, int srcLength, bool* error)
	115	{
	116	return convertCase(result, resultLength, src, srcLength, error, g_utf8_strup);
ba379fdc A	117	}
	118
	119	Direction direction(UChar32 c)
	120	{
	121	PangoBidiType type = pango_bidi_type_for_unichar(c);
	122	switch (type) {
	123	case PANGO_BIDI_TYPE_L:
	124	return LeftToRight;
	125	case PANGO_BIDI_TYPE_R:
	126	return RightToLeft;
	127	case PANGO_BIDI_TYPE_AL:
	128	return RightToLeftArabic;
	129	case PANGO_BIDI_TYPE_LRE:
	130	return LeftToRightEmbedding;
	131	case PANGO_BIDI_TYPE_RLE:
	132	return RightToLeftEmbedding;
	133	case PANGO_BIDI_TYPE_LRO:
	134	return LeftToRightOverride;
	135	case PANGO_BIDI_TYPE_RLO:
	136	return RightToLeftOverride;
	137	case PANGO_BIDI_TYPE_PDF:
	138	return PopDirectionalFormat;
	139	case PANGO_BIDI_TYPE_EN:
	140	return EuropeanNumber;
	141	case PANGO_BIDI_TYPE_AN:
	142	return ArabicNumber;
	143	case PANGO_BIDI_TYPE_ES:
	144	return EuropeanNumberSeparator;
	145	case PANGO_BIDI_TYPE_ET:
	146	return EuropeanNumberTerminator;
	147	case PANGO_BIDI_TYPE_CS:
	148	return CommonNumberSeparator;
	149	case PANGO_BIDI_TYPE_NSM:
	150	return NonSpacingMark;
	151	case PANGO_BIDI_TYPE_BN:
	152	return BoundaryNeutral;
	153	case PANGO_BIDI_TYPE_B:
	154	return BlockSeparator;
	155	case PANGO_BIDI_TYPE_S:
	156	return SegmentSeparator;
	157	case PANGO_BIDI_TYPE_WS:
	158	return WhiteSpaceNeutral;
	159	default:
	160	return OtherNeutral;
	161	}
	162	}
	163
	164	int umemcasecmp(const UChar* a, const UChar* b, int len)
	165	{
	166	GOwnPtr<char> utf8a;
	167	GOwnPtr<char> utf8b;
	168
	169	utf8a.set(g_utf16_to_utf8(a, len, 0, 0, 0));
	170	utf8b.set(g_utf16_to_utf8(b, len, 0, 0, 0));
	171
	172	GOwnPtr<char> foldedA;
	173	GOwnPtr<char> foldedB;
	174
	175	foldedA.set(g_utf8_casefold(utf8a.get(), -1));
	176	foldedB.set(g_utf8_casefold(utf8b.get(), -1));
	177
	178	// FIXME: umemcasecmp needs to mimic u_memcasecmp of icu
	179	// from the ICU docs:
	180	// "Compare two strings case-insensitively using full case folding.
181	// his is equivalent to u_strcmp(u_strFoldCase(s1, n, options), u_strFoldCase(s2, n, options))."
182	//
183	// So it looks like we don't need the full g_utf8_collate here,
184	// but really a bitwise comparison of casefolded unicode chars (not utf-8 bytes).
185	// As there is no direct equivalent to this icu function in GLib, for now
186	// we'll use g_utf8_collate():
187
188	return g_utf8_collate(foldedA.get(), foldedB.get());
189	}
190
191	}
192	}