[apple/icu.git] / icuSources / i18n / csrmbcs.cpp

/*
 **********************************************************************
 *   Copyright (C) 2005-2008, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "csrmbcs.h"

#include <math.h>

U_NAMESPACE_BEGIN

// Number of elements in a true array (never pass a pointer).
// The argument is parenthesized so any array-valued expression is accepted,
// and the count is cast to int32_t because every use in this file passes it
// as the int32_t table length of match_mbcs().
#define ARRAY_SIZE(array) ((int32_t)(sizeof(array) / sizeof((array)[0])))

#define min(x,y) (((x)<(y))?(x):(y))

// Common double-byte Shift_JIS characters, stored as (lead byte << 8) | trail byte.
// The entries must stay sorted in ascending order: match_mbcs() looks values
// up in this table with binarySearch().
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

// Common double-byte EUC-JP characters, stored as (first byte << 8) | second byte.
// The entries must stay sorted in ascending order: match_mbcs() looks values
// up in this table with binarySearch().
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

// Common double-byte EUC-KR characters, stored as (first byte << 8) | second byte.
// The entries must stay sorted in ascending order: match_mbcs() looks values
// up in this table with binarySearch().
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

// Common double-byte Big5 characters, stored as (first byte << 8) | second byte.
// The entries must stay sorted in ascending order: match_mbcs() looks values
// up in this table with binarySearch().
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

// Common double-byte GB18030 characters, stored as (first byte << 8) | second byte.
// The entries must stay sorted in ascending order: match_mbcs() looks values
// up in this table with binarySearch().
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};

/**
 * Looks up value in array, which must hold len entries sorted in
 * ascending order.
 *
 * @param array  sorted table to search
 * @param len    number of entries in array (0 is allowed)
 * @param value  value to look for
 * @return the index of a matching entry, or -1 if value is not present.
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    // Invariant: if value is in the table, it lies within array[lo..hi].
    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}

// Starts the iterator before any character has been read: index is -1,
// the next byte to fetch is at offset 0, and both status flags are clear.
IteratedChar::IteratedChar()
    : charValue(0),
      index(-1),
      nextIndex(0),
      error(FALSE),
      done(FALSE)
{
}

/*void IteratedChar::reset()
{
    charValue = 0;
    index     = -1;
    nextIndex = 0;
    error     = FALSE;
    done      = FALSE;
}*/

// Fetches the byte at nextIndex from det->fRawInput and advances nextIndex.
// Once nextIndex has reached det->fRawLength, sets done and returns -1
// instead; nextIndex is then left unchanged.
// NOTE(review): callers treat a negative result as end of input, which
// presumably relies on fRawInput holding unsigned bytes -- confirm in InputText.
int32_t IteratedChar::nextByte(InputText *det)
{
    if (nextIndex < det->fRawLength) {
        return det->fRawInput[nextIndex++];
    }

    done = TRUE;
    return -1;
}

// Destructor for the multi-byte recognizer base class; it performs no cleanup.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}

/**
 * Scores how well the raw input in det fits this multi-byte encoding.
 *
 * The input is walked with nextChar(); characters are tallied as malformed,
 * single-byte (value <= 0xFF) or multi-byte, and multi-byte characters are
 * additionally checked against the commonChars table.
 *
 * @param det             input text to examine
 * @param commonChars     table of common double-byte characters, sorted in
 *                        ascending order, or 0 if no statistics are available
 * @param commonCharsLen  number of entries in commonChars
 * @return a confidence value in the range 0..100
 */
int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
    int32_t doubleByteCharCount = 0;
    int32_t commonCharCount     = 0;
    int32_t badCharCount        = 0;
    int32_t totalCharCount      = 0;
    int32_t confidence          = 0;
    IteratedChar iter;

    while (nextChar(&iter, det)) {
        totalCharCount++;

        if (iter.error) {
            badCharCount++;
        } else if (iter.charValue > 0xFF) {
            // Anything wider than one byte counts as a multi-byte character.
            doubleByteCharCount++;

            // The table only holds 16-bit values.  Three- and four-byte
            // characters must be skipped explicitly: passing them to
            // binarySearch() would silently truncate them to their low
            // 16 bits, which can collide with a common two-byte character.
            if (commonChars != 0 && iter.charValue <= 0xFFFF &&
                binarySearch(commonChars, commonCharsLen, (uint16_t)iter.charValue) >= 0) {
                commonCharCount += 1;
            }
        }

        if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
            // Bail out early if the byte data is not matching the encoding scheme.
            // confidence is still 0 at this point.
            return confidence;
        }
    }

    if (doubleByteCharCount <= 10 && badCharCount == 0) {
        // Not many multi-byte chars.
        if (doubleByteCharCount == 0 && totalCharCount < 10) {
            // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
            // We don't have enough data to have any confidence.
            // Statistical analysis of single byte non-ASCII characters would probably help here.
            confidence = 0;
        } else {
            //   ASCII or ISO file?  It's probably not our encoding,
            //   but is not incompatible with our encoding, so don't give it a zero.
            confidence = 10;
        }

        return confidence;
    }

    //
    //  No match if there are too many characters that don't fit the encoding scheme.
    //    (should we have zero tolerance for these?)
    //
    if (doubleByteCharCount < 20*badCharCount) {
        return 0;
    }

    if (commonChars == 0) {
        // We have no statistics on frequently occurring characters.
        //  Assess confidence purely on having a reasonable number of
        //  multi-byte characters (the more the better)
        confidence = 30 + doubleByteCharCount - 20*badCharCount;
    } else {
        //
        // Frequency of occurrence statistics exist.
        //
        // doubleByteCharCount is at least 11 here (more than 10 when there
        // were no bad characters, at least 20 otherwise), so maxVal is
        // strictly positive and the division below is safe.
        double maxVal = log10((double)doubleByteCharCount / 4);
        double scaleFactor = 90.0 / maxVal;

        confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
    }

    // Clamp the result to the 0..100 range.
    if (confidence > 100) {
        confidence = 100;
    }

    if (confidence < 0) {
        confidence = 0;
    }

    return confidence;
}

// Destructor for the Shift_JIS recognizer; it performs no cleanup.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}

// Reads the next Shift_JIS character from det into it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the single byte, or (lead << 8) | trail for a two-byte
// character, and it->error is set when the trail byte is missing or outside
// 0x40..0xFE.
UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
    it->index = it->nextIndex;
    it->error = FALSE;

    int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // No more input.
        return FALSE;
    }

    // ASCII range: a complete character on its own.
    if (lead <= 0x7F) {
        return TRUE;
    }

    // 0xA1..0xDF is also treated as a complete one-byte character.
    if (lead > 0xA0 && lead <= 0xDF) {
        return TRUE;
    }

    // Everything else is taken as the lead byte of a two-byte character.
    int32_t trail = it->nextByte(det);

    if (trail >= 0) {
        it->charValue = (lead << 8) | trail;
    }

    // A trail byte of -1 (ran out of input) is also caught by this range check.
    // NOTE(review): 0x7F, 0xFD and 0xFE are accepted as trail bytes here;
    // confirm whether that leniency is intended.
    if (trail < 0x40 || trail > 0xFE) {
        it->error = TRUE;
    }

    return TRUE;
}

// Scores det as Shift_JIS: runs match_mbcs() with the Shift_JIS
// common-character table and returns its confidence value.
int32_t CharsetRecog_sjis::match(InputText* det)
{
    return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
}

// Returns the charset name reported by this recognizer.
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}

// Returns the language code reported by this recognizer.
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}

// Destructor for the shared EUC recognizer class; it performs no cleanup.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}

// Reads the next EUC character from det into it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the bytes that were successfully read, packed most
// significant first, and it->error is set when a required trailing byte is
// missing or below 0xA1.
//
// First-byte handling:
//   <= 0x8D        single-byte character
//   0xA1..0xFE     two-byte character
//   0x8E           code set 2, treated as a two-byte character
//   0x8F           code set 3, three-byte character
// NOTE(review): any other first byte (0x90..0xA0, 0xFF) falls through to the
// final return with a second byte already consumed and no error flagged;
// confirm whether that is intended.
UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x8D) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0xA1 && firstByte <= 0xFE) {
        // Two byte Char
        if (secondByte < 0xA1) {
            // Bad second byte, or -1 because the input ended.
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8E) {
        // Code Set 2.
        //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
        //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
        // We don't know which we've got.
        // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
        //   bytes will look like a well formed 2 byte char.
        if (secondByte < 0xA1) {
            it->error = TRUE;
        }

        return TRUE;
    }

    if (firstByte == 0x8F) {
        // Code set 3.
        // Three byte total char size, two bytes of actual char value.
        thirdByte = it->nextByte(det);

        // Only fold in a byte that was actually read, as is done for the
        // second byte above; OR-ing in the -1 end-of-input marker would
        // overwrite charValue with all one bits.
        if (thirdByte >= 0) {
            it->charValue = (it->charValue << 8) | thirdByte;
        }

        if (thirdByte < 0xa1) {
            // Bad third byte or ran off the end of the input data.
            it->error = TRUE;
        }
    }

    return TRUE;

}

// Destructor for the EUC-JP recognizer; it performs no cleanup.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}

// Returns the charset name reported by this recognizer.
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}

// Returns the language code reported by this recognizer.
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}

// Scores det as EUC-JP: runs match_mbcs() with the EUC-JP
// common-character table and returns its confidence value.
int32_t CharsetRecog_euc_jp::match(InputText *det)
{
    return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
}

// Destructor for the EUC-KR recognizer; it performs no cleanup.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}

// Returns the charset name reported by this recognizer.
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}

// Returns the language code reported by this recognizer.
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}

// Scores det as EUC-KR: runs match_mbcs() with the EUC-KR
// common-character table and returns its confidence value.
int32_t CharsetRecog_euc_kr::match(InputText *det)
{
    return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
}

// Destructor for the Big5 recognizer; it performs no cleanup.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}

// Reads the next Big5 character from det into it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the single byte, or (lead << 8) | trail for a two-byte
// character, and it->error is set when the trail byte is missing, below
// 0x40, or equal to 0x7F or 0xFF.
UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
{
    it->index = it->nextIndex;
    it->error = FALSE;

    int32_t lead = it->charValue = it->nextByte(det);

    if (lead < 0) {
        // No more input.
        return FALSE;
    }

    // Bytes up to 0x7F, and the byte 0xFF, stand alone.
    if (lead == 0xFF || lead <= 0x7F) {
        return TRUE;
    }

    int32_t trail = it->nextByte(det);

    if (trail >= 0)  {
        it->charValue = (it->charValue << 8) | trail;
    }

    // An end-of-input marker (-1) fails the >= 0x40 test and is flagged too.
    const bool trailIsValid = trail >= 0x40 && trail != 0x7F && trail != 0xFF;

    if (!trailIsValid) {
        it->error = TRUE;
    }

    return TRUE;
}

// Returns the charset name reported by this recognizer.
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}

// Returns the language code reported by this recognizer.
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}

// Scores det as Big5: runs match_mbcs() with the Big5
// common-character table and returns its confidence value.
int32_t CharsetRecog_big5::match(InputText *det)
{
    return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
}

// Destructor for the GB18030 recognizer; it performs no cleanup.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}

// Reads the next GB18030 character from det into it.
//
// Returns FALSE once the input is exhausted, TRUE otherwise.  On TRUE,
// it->charValue holds the bytes of the character packed most significant
// first, and it->error is set when the byte sequence is not well formed.
//
// Forms recognized for a first byte in 0x81..0xFE:
//   two bytes:   second byte in 0x40..0x7E or 0x80..0xFE
//   four bytes:  second in 0x30..0x39, third in 0x81..0xFE, fourth in 0x30..0x39
// A first byte <= 0x80 is a single-byte character.
// NOTE(review): a first byte of 0xFF falls through to the final return with a
// second byte already consumed and no error flagged; confirm whether that is
// intended.
UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
    int32_t firstByte  = 0;
    int32_t secondByte = 0;
    int32_t thirdByte  = 0;
    int32_t fourthByte = 0;

    it->index = it->nextIndex;
    it->error = FALSE;
    firstByte = it->charValue = it->nextByte(det);

    if (firstByte < 0) {
        // Ran off the end of the input data
        return FALSE;
    }

    if (firstByte <= 0x80) {
        // single byte char
        return TRUE;
    }

    secondByte = it->nextByte(det);
    if (secondByte >= 0) {
        it->charValue = (it->charValue << 8) | secondByte;
    }
    // else we'll handle the error later.

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
        // Two byte Char
        // The second range starts at 0x80 (hex).  Writing it as decimal 80
        // (0x50) would merge the two ranges and accept the invalid trail
        // byte 0x7F.
        if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
            return TRUE;
        }

        // Four byte char
        if (secondByte >= 0x30 && secondByte <= 0x39) {
            thirdByte = it->nextByte(det);

            if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                fourthByte = it->nextByte(det);

                if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                    it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;

                    return TRUE;
                }
            }
        }

        // Something wasn't valid, or we ran out of data (-1).
        it->error = TRUE;
    }

    return TRUE;
}

// Returns the charset name reported by this recognizer.
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}

// Returns the language code reported by this recognizer.
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}

// Scores det as GB18030: runs match_mbcs() with the GB18030
// common-character table and returns its confidence value.
int32_t CharsetRecog_gb_18030::match(InputText *det)
{
    return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
}

U_NAMESPACE_END
#endif
Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	**********************************************************************
46f4442e	3	* Copyright (C) 2005-2008, International Business Machines
73c04bcf A	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#if !UCONFIG_NO_CONVERSION
	11
	12	#include "csrmbcs.h"
	13
	14	#include <math.h>
	15
	16	U_NAMESPACE_BEGIN
	17
	18	#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
	19
	20	#define min(x,y) (((x)<(y))?(x):(y))
	21
46f4442e	22	static const uint16_t commonChars_sjis [] = {
73c04bcf A	23	// TODO: This set of data comes from the character frequency-
	24	// of-occurence analysis tool. The data needs to be moved
	25	// into a resource and loaded from there.
	26	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	27	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	28	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	29	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	30	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	31	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
	32
46f4442e	33	static const uint16_t commonChars_euc_jp[] = {
73c04bcf A	34	// TODO: This set of data comes from the character frequency-
	35	// of-occurence analysis tool. The data needs to be moved
	36	// into a resource and loaded from there.
	37	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	38	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	39	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	40	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	41	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	42	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	43	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	44	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	45	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	46	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
	47
46f4442e	48	static const uint16_t commonChars_euc_kr[] = {
73c04bcf A	49	// TODO: This set of data comes from the character frequency-
	50	// of-occurence analysis tool. The data needs to be moved
	51	// into a resource and loaded from there.
	52	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	53	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	54	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	55	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	56	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	57	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	58	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	59	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	60	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	61	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
	62
46f4442e	63	static const uint16_t commonChars_big5[] = {
73c04bcf A	64	// TODO: This set of data comes from the character frequency-
	65	// of-occurence analysis tool. The data needs to be moved
	66	// into a resource and loaded from there.
	67	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	68	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	69	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	70	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	71	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	72	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	73	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	74	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	75	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	76	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
	77
46f4442e	78	static const uint16_t commonChars_gb_18030[] = {
73c04bcf A	79	// TODO: This set of data comes from the character frequency-
	80	// of-occurence analysis tool. The data needs to be moved
	81	// into a resource and loaded from there.
	82	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	83	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	84	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	85	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	86	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	87	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	88	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	89	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	90	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	91	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
	92
46f4442e	93	static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
73c04bcf A	94	{
	95	int32_t start = 0, end = len-1;
	96	int32_t mid = (start+end)/2;
	97
	98	while(start <= end) {
	99	if(array[mid] == value) {
	100	return mid;
	101	}
	102
	103	if(array[mid] < value){
	104	start = mid+1;
	105	} else {
	106	end = mid-1;
	107	}
	108
	109	mid = (start+end)/2;
	110	}
	111
	112	return -1;
	113	}
	114
46f4442e A	115	IteratedChar::IteratedChar() :
46f4442e A	116	charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
73c04bcf A	117	{
	118	// nothing else to do.
	119	}
	120
46f4442e	121	/*void IteratedChar::reset()
73c04bcf A	122	{
	123	charValue = 0;
	124	index = -1;
	125	nextIndex = 0;
	126	error = FALSE;
	127	done = FALSE;
46f4442e	128	}*/
73c04bcf A	129
	130	int32_t IteratedChar::nextByte(InputText *det)
	131	{
	132	if (nextIndex >= det->fRawLength) {
	133	done = TRUE;
	134
	135	return -1;
	136	}
	137
	138	return det->fRawInput[nextIndex++];
	139	}
	140
	141	CharsetRecog_mbcs::~CharsetRecog_mbcs()
	142	{
	143	// nothing to do.
	144	}
	145
46f4442e A	146	int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
	147	int32_t singleByteCharCount = 0;
	148	int32_t doubleByteCharCount = 0;
	149	int32_t commonCharCount = 0;
	150	int32_t badCharCount = 0;
	151	int32_t totalCharCount = 0;
	152	int32_t confidence = 0;
	153	IteratedChar iter;
	154
	155	while (nextChar(&iter, det)) {
	156	totalCharCount++;
	157
	158	if (iter.error) {
	159	badCharCount++;
73c04bcf	160	} else {
46f4442e A	161	if (iter.charValue <= 0xFF) {
46f4442e A	162	singleByteCharCount++;
73c04bcf	163	} else {
46f4442e	164	doubleByteCharCount++;
73c04bcf A	165
73c04bcf A	166	if (commonChars != 0) {
46f4442e	167	if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
73c04bcf A	168	commonCharCount += 1;
	169	}
	170	}
	171	}
	172	}
	173
	174
	175	if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
	176	// Bail out early if the byte data is not matching the encoding scheme.
	177	// break detectBlock;
73c04bcf A	178	return confidence;
	179	}
	180	}
	181
73c04bcf A	182	if (doubleByteCharCount <= 10 && badCharCount == 0) {
73c04bcf A	183	// Not many multi-byte chars.
46f4442e A	184	if (doubleByteCharCount == 0 && totalCharCount < 10) {
	185	// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
	186	// We don't have enough data to have any confidence.
	187	// Statistical analysis of single byte non-ASCII charcters would probably help here.
	188	confidence = 0;
	189	}
	190	else {
	191	// ASCII or ISO file? It's probably not our encoding,
	192	// but is not incompatible with our encoding, so don't give it a zero.
	193	confidence = 10;
	194	}
73c04bcf A	195
	196	return confidence;
	197	}
	198
	199	//
	200	// No match if there are too many characters that don't fit the encoding scheme.
	201	// (should we have zero tolerance for these?)
	202	//
	203	if (doubleByteCharCount < 20*badCharCount) {
	204	confidence = 0;
	205
	206	return confidence;
	207	}
	208
	209	if (commonChars == 0) {
	210	// We have no statistics on frequently occuring characters.
	211	// Assess confidence purely on having a reasonable number of
	212	// multi-byte characters (the more the better)
	213	confidence = 30 + doubleByteCharCount - 20*badCharCount;
	214
	215	if (confidence > 100) {
	216	confidence = 100;
	217	}
	218	} else {
	219	//
	220	// Frequency of occurence statistics exist.
	221	//
	222
	223	double maxVal = log10((double)doubleByteCharCount / 4); /(float)?/
	224	double scaleFactor = 90.0 / maxVal;
	225	confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0);
	226
	227	confidence = min(confidence, 100);
	228	}
	229
	230	if (confidence < 0) {
	231	confidence = 0;
	232	}
	233
	234	return confidence;
	235	}
	236
	237	CharsetRecog_sjis::~CharsetRecog_sjis()
	238	{
	239	// nothing to do
	240	}
	241
	242	UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
	243	it->index = it->nextIndex;
	244	it->error = FALSE;
	245
	246	int32_t firstByte = it->charValue = it->nextByte(det);
	247
	248	if (firstByte < 0) {
	249	return FALSE;
	250	}
	251
	252	if (firstByte <= 0x7F \|\| (firstByte > 0xA0 && firstByte <= 0xDF)) {
	253	return TRUE;
	254	}
	255
	256	int32_t secondByte = it->nextByte(det);
46f4442e A	257	if (secondByte >= 0) {
46f4442e A	258	it->charValue = (firstByte << 8) \| secondByte;
73c04bcf	259	}
46f4442e A	260	// else we'll handle the error later.
46f4442e A	261
73c04bcf A	262	if (! ((secondByte >= 0x40 && secondByte <= 0x7F) \|\| (secondByte >= 0x80 && secondByte <= 0xFE))) {
	263	// Illegal second byte value.
	264	it->error = TRUE;
	265	}
	266
	267	return TRUE;
	268	}
	269
	270	int32_t CharsetRecog_sjis::match(InputText* det)
	271	{
	272	return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
	273	}
	274
	275	const char *CharsetRecog_sjis::getName() const
	276	{
	277	return "Shift_JIS";
	278	}
	279
	280	const char *CharsetRecog_sjis::getLanguage() const
	281	{
	282	return "ja";
	283	}
	284
	285	CharsetRecog_euc::~CharsetRecog_euc()
	286	{
	287	// nothing to do
	288	}
	289
	290	UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
	291	int32_t firstByte = 0;
	292	int32_t secondByte = 0;
	293	int32_t thirdByte = 0;
73c04bcf A	294
	295	it->index = it->nextIndex;
	296	it->error = FALSE;
	297	firstByte = it->charValue = it->nextByte(det);
	298
	299	if (firstByte < 0) {
	300	// Ran off the end of the input data
46f4442e	301	return FALSE;
73c04bcf A	302	}
	303
	304	if (firstByte <= 0x8D) {
	305	// single byte char
46f4442e	306	return TRUE;
73c04bcf A	307	}
	308
	309	secondByte = it->nextByte(det);
46f4442e A	310	if (secondByte >= 0) {
	311	it->charValue = (it->charValue << 8) \| secondByte;
	312	}
	313	// else we'll handle the error later.
73c04bcf A	314
	315	if (firstByte >= 0xA1 && firstByte <= 0xFE) {
	316	// Two byte Char
	317	if (secondByte < 0xA1) {
	318	it->error = TRUE;
	319	}
	320
46f4442e	321	return TRUE;
73c04bcf A	322	}
	323
	324	if (firstByte == 0x8E) {
	325	// Code Set 2.
	326	// In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
	327	// In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
	328	// We don't know which we've got.
	329	// Treat it like EUC-JP. If the data really was EUC-TW, the following two
	330	// bytes will look like a well formed 2 byte char.
	331	if (secondByte < 0xA1) {
	332	it->error = TRUE;
	333	}
	334
46f4442e	335	return TRUE;
73c04bcf A	336	}
	337
	338	if (firstByte == 0x8F) {
	339	// Code set 3.
	340	// Three byte total char size, two bytes of actual char value.
	341	thirdByte = it->nextByte(det);
	342	it->charValue = (it->charValue << 8) \| thirdByte;
	343
	344	if (thirdByte < 0xa1) {
46f4442e	345	// Bad second byte or ran off the end of the input data with a non-ASCII first byte.
73c04bcf A	346	it->error = TRUE;
	347	}
	348	}
	349
46f4442e	350	return TRUE;
73c04bcf A	351
	352	}
	353
	354	CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
	355	{
	356	// nothing to do
	357	}
	358
	359	const char *CharsetRecog_euc_jp::getName() const
	360	{
	361	return "EUC-JP";
	362	}
	363
	364	const char *CharsetRecog_euc_jp::getLanguage() const
	365	{
	366	return "ja";
	367	}
	368
	369	int32_t CharsetRecog_euc_jp::match(InputText *det)
	370	{
	371	return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
	372	}
	373
	374	CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
	375	{
	376	// nothing to do
	377	}
	378
	379	const char *CharsetRecog_euc_kr::getName() const
	380	{
	381	return "EUC-KR";
	382	}
	383
	384	const char *CharsetRecog_euc_kr::getLanguage() const
	385	{
	386	return "ko";
	387	}
	388
	389	int32_t CharsetRecog_euc_kr::match(InputText *det)
	390	{
	391	return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
	392	}
	393
	394	CharsetRecog_big5::~CharsetRecog_big5()
	395	{
	396	// nothing to do
	397	}
	398
	399	UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
	400	{
	401	int32_t firstByte;
	402
	403	it->index = it->nextIndex;
	404	it->error = FALSE;
	405	firstByte = it->charValue = it->nextByte(det);
	406
	407	if (firstByte < 0) {
	408	return FALSE;
	409	}
	410
	411	if (firstByte <= 0x7F \|\| firstByte == 0xFF) {
	412	// single byte character.
	413	return TRUE;
	414	}
415
416	int32_t secondByte = it->nextByte(det);
46f4442e A	417	if (secondByte >= 0) {
46f4442e A	418	it->charValue = (it->charValue << 8) \| secondByte;
73c04bcf	419	}
46f4442e	420	// else we'll handle the error later.
73c04bcf	421
46f4442e A	422	if (secondByte < 0x40 \|\| secondByte == 0x7F \|\| secondByte == 0xFF) {
46f4442e A	423	it->error = TRUE;
73c04bcf A	424	}
	425
	426	return TRUE;
	427	}
	428
	429	const char *CharsetRecog_big5::getName() const
	430	{
	431	return "Big5";
	432	}
	433
	434	const char *CharsetRecog_big5::getLanguage() const
	435	{
	436	return "zh";
	437	}
	438
	439	int32_t CharsetRecog_big5::match(InputText *det)
	440	{
	441	return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
	442	}
	443
	444	CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
	445	{
	446	// nothing to do
	447	}
	448
	449	UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
	450	int32_t firstByte = 0;
	451	int32_t secondByte = 0;
	452	int32_t thirdByte = 0;
	453	int32_t fourthByte = 0;
	454
	455	it->index = it->nextIndex;
	456	it->error = FALSE;
	457	firstByte = it->charValue = it->nextByte(det);
	458
	459	if (firstByte < 0) {
	460	// Ran off the end of the input data
46f4442e	461	return FALSE;
73c04bcf A	462	}
	463
	464	if (firstByte <= 0x80) {
	465	// single byte char
46f4442e	466	return TRUE;
73c04bcf A	467	}
	468
	469	secondByte = it->nextByte(det);
46f4442e A	470	if (secondByte >= 0) {
	471	it->charValue = (it->charValue << 8) \| secondByte;
	472	}
	473	// else we'll handle the error later.
73c04bcf A	474
	475	if (firstByte >= 0x81 && firstByte <= 0xFE) {
	476	// Two byte Char
	477	if ((secondByte >= 0x40 && secondByte <= 0x7E) \|\| (secondByte >=80 && secondByte <= 0xFE)) {
46f4442e	478	return TRUE;
73c04bcf A	479	}
	480
	481	// Four byte char
	482	if (secondByte >= 0x30 && secondByte <= 0x39) {
	483	thirdByte = it->nextByte(det);
	484
	485	if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
	486	fourthByte = it->nextByte(det);
	487
	488	if (fourthByte >= 0x30 && fourthByte <= 0x39) {
	489	it->charValue = (it->charValue << 16) \| (thirdByte << 8) \| fourthByte;
	490
46f4442e	491	return TRUE;
73c04bcf A	492	}
	493	}
	494	}
	495
46f4442e	496	// Something wasn't valid, or we ran out of data (-1).
73c04bcf	497	it->error = TRUE;
73c04bcf A	498	}
73c04bcf A	499
46f4442e	500	return TRUE;
73c04bcf A	501	}
	502
	503	const char *CharsetRecog_gb_18030::getName() const
	504	{
	505	return "GB18030";
	506	}
	507
	508	const char *CharsetRecog_gb_18030::getLanguage() const
	509	{
	510	return "zh";
	511	}
	512
	513	int32_t CharsetRecog_gb_18030::match(InputText *det)
	514	{
	515	return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
	516	}
	517
	518	U_NAMESPACE_END
	519	#endif