[apple/icu.git] / icuSources / i18n / csrucode.cpp

/*
 **********************************************************************
 *   Copyright (C) 2005-2006, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "csrucode.h"

U_NAMESPACE_BEGIN

// Out-of-line destructor definition for CharsetRecog_Unicode.
CharsetRecog_Unicode::~CharsetRecog_Unicode()
{
    // Intentionally empty: no cleanup is performed here.
}

// Out-of-line destructor definition for the UTF-16BE recognizer.
CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
{
    // Intentionally empty: no cleanup is performed here.
}

// Returns the charset name reported by this recognizer: "UTF-16BE".
const char *CharsetRecog_UTF_16_BE::getName() const
{
    static const char kCharsetName[] = "UTF-16BE";

    return kCharsetName;
}

// Scores how likely the raw input is UTF-16BE.
//
// Only the byte order mark is examined: input that starts with the bytes
// FE FF scores 100, everything else scores 0.
//
//   textIn  the text being examined; fRawInput and fRawLength are read.
//   returns 100 when the input starts with a UTF-16BE BOM, otherwise 0.
int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn)
{
    const uint8_t *input = textIn->fRawInput;

    // The BOM is two bytes long; never read past the end of shorter input.
    if (textIn->fRawLength >= 2 && input[0] == 0xFE && input[1] == 0xFF) {
        return 100;
    }

    // TODO: Do some statistics to check for UTF-16BE text that has no BOM
    return 0;
}

// Out-of-line destructor definition for the UTF-16LE recognizer.
CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
{
    // Intentionally empty: no cleanup is performed here.
}

// Returns the charset name reported by this recognizer: "UTF-16LE".
const char *CharsetRecog_UTF_16_LE::getName() const
{
    static const char kCharsetName[] = "UTF-16LE";

    return kCharsetName;
}

// Scores how likely the raw input is UTF-16LE.
//
// Only the byte order mark is examined: input that starts with the bytes
// FF FE scores 100, unless those bytes are followed by 00 00, in which case
// the prefix is the UTF-32LE BOM (FF FE 00 00) and the score is 0.
//
//   textIn  the text being examined; fRawInput and fRawLength are read.
//   returns 100 when the input starts with a UTF-16LE BOM that is not the
//           start of a UTF-32LE BOM, otherwise 0.
int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn)
{
    const uint8_t *input = textIn->fRawInput;
    const int32_t length = textIn->fRawLength;

    // The BOM is two bytes long; never read past the end of shorter input.
    if (length < 2 || input[0] != 0xFF || input[1] != 0xFE) {
        // TODO: Do some statistics to check for UTF-16LE text that has no BOM
        return 0;
    }

    // FF FE 00 00 is the UTF-32LE BOM. Bytes 2 and 3 are only inspected when
    // they actually exist; 2- or 3-byte input cannot hold a UTF-32 BOM.
    if (length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
        return 0;
    }

    return 100;
}

// Out-of-line destructor definition for CharsetRecog_UTF_32.
CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
{
    // Intentionally empty: no cleanup is performed here.
}

// Scores how likely the raw input is UTF-32 in the byte order implemented by
// getChar().
//
// The input is walked in 4-byte units (trailing bytes that do not fill a
// unit are ignored). Each unit is decoded with getChar() and counted as
// valid or invalid; the counts, together with whether the first unit is a
// BOM (0x0000FEFF), are turned into a confidence value.
//
//   textIn  the text being examined; fRawInput and fRawLength are read.
//   returns one of 0, 25, 80 or 100; 0 when fewer than 4 bytes are available.
int32_t CharsetRecog_UTF_32::match(InputText* textIn)
{
    const uint8_t *input = textIn->fRawInput;
    const int32_t limit = (textIn->fRawLength / 4) * 4;
    int32_t numValid = 0;
    int32_t numInvalid = 0;
    bool hasBOM = false;
    int32_t confidence = 0;

    // Without at least one complete 4-byte unit there is nothing to decode,
    // and looking for a BOM would read past the end of the input.
    if (limit < 4) {
        return 0;
    }

    if (getChar(input, 0) == 0x0000FEFFUL) {
        hasBOM = true;
    }

    for (int32_t i = 0; i < limit; i += 4) {
        const int32_t ch = getChar(input, i);

        // Negative values (top bit set), values at or above 0x10FFFF and
        // surrogates (0xD800..0xDFFF) count as invalid.
        // NOTE(review): the upper bound also rejects U+10FFFF itself, which
        // is the last code point (a noncharacter) - confirm this is intended.
        if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
            numInvalid += 1;
        } else {
            numValid += 1;
        }
    }

    // Cook up some sort of confidence score, based on presence of a BOM
    //    and the existence of valid and/or invalid multi-byte sequences.
    if (hasBOM && numInvalid == 0) {
        confidence = 100;
    } else if (hasBOM && numValid > numInvalid*10) {
        confidence = 80;
    } else if (numValid > 3 && numInvalid == 0) {
        confidence = 100;
    } else if (numValid > 0 && numInvalid == 0) {
        confidence = 80;
    } else if (numValid > numInvalid*10) {
        // Probably corrupt UTF-32 data.  Valid sequences aren't likely by chance.
        confidence = 25;
    }

    return confidence;
}

// Out-of-line destructor definition for the UTF-32BE recognizer.
CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
{
    // Intentionally empty: no cleanup is performed here.
}

// Returns the charset name reported by this recognizer: "UTF-32BE".
const char *CharsetRecog_UTF_32_BE::getName() const
{
    static const char kCharsetName[] = "UTF-32BE";

    return kCharsetName;
}

// Decodes the 4 bytes starting at input[index] as one big-endian 32-bit
// value (input[index] is the most significant byte).
//
//   input   the raw bytes; index .. index + 3 must be readable.
//   index   offset of the first byte of the unit.
//   returns the assembled value; it is negative when the top bit of the most
//           significant byte is set.
int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
{
    // Assemble in an unsigned type: a uint8_t promotes to (signed) int, and
    // shifting a byte >= 0x80 left by 24 would shift into the sign bit.
    const uint32_t ch = (static_cast<uint32_t>(input[index + 0]) << 24) |
                        (static_cast<uint32_t>(input[index + 1]) << 16) |
                        (static_cast<uint32_t>(input[index + 2]) <<  8) |
                         static_cast<uint32_t>(input[index + 3]);

    return static_cast<int32_t>(ch);
}

// Out-of-line destructor definition for the UTF-32LE recognizer.
CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
{
    // Intentionally empty: no cleanup is performed here.
}

// Returns the charset name reported by this recognizer: "UTF-32LE".
const char *CharsetRecog_UTF_32_LE::getName() const
{
    static const char kCharsetName[] = "UTF-32LE";

    return kCharsetName;
}

// Decodes the 4 bytes starting at input[index] as one little-endian 32-bit
// value (input[index] is the least significant byte).
//
//   input   the raw bytes; index .. index + 3 must be readable.
//   index   offset of the first byte of the unit.
//   returns the assembled value; it is negative when the top bit of the most
//           significant byte is set.
int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
{
    // Assemble in an unsigned type: a uint8_t promotes to (signed) int, and
    // shifting a byte >= 0x80 left by 24 would shift into the sign bit.
    const uint32_t ch = (static_cast<uint32_t>(input[index + 3]) << 24) |
                        (static_cast<uint32_t>(input[index + 2]) << 16) |
                        (static_cast<uint32_t>(input[index + 1]) <<  8) |
                         static_cast<uint32_t>(input[index + 0]);

    return static_cast<int32_t>(ch);
}

U_NAMESPACE_END
#endif
Commit	Line	Data
73c04bcf A	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2005-2006, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#if !UCONFIG_NO_CONVERSION
	11
	12	#include "csrucode.h"
	13
	14	U_NAMESPACE_BEGIN
	15
	16	CharsetRecog_Unicode::~CharsetRecog_Unicode()
	17	{
	18	// nothing to do
	19	}
	20
	21	CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
	22	{
	23	// nothing to do
	24	}
	25
	26	const char *CharsetRecog_UTF_16_BE::getName() const
	27	{
	28	return "UTF-16BE";
	29	}
	30
	31	int32_t CharsetRecog_UTF_16_BE::match(InputText* textIn)
	32	{
	33	const uint8_t *input = textIn->fRawInput;
	34
	35	if (input[0] == 0xFE && input[1] == 0xFF) {
	36	return 100;
	37	}
	38
	39	// TODO: Do some statastics to check for unsigned UTF-16BE
	40	return 0;
	41	}
	42
	43	CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
	44	{
	45	// nothing to do
	46	}
	47
	48	const char *CharsetRecog_UTF_16_LE::getName() const
	49	{
	50	return "UTF-16LE";
	51	}
	52
	53	int32_t CharsetRecog_UTF_16_LE::match(InputText* textIn)
	54	{
	55	const uint8_t *input = textIn->fRawInput;
	56
	57	if (input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 \|\| input[3] != 0x00)) {
	58	return 100;
	59	}
	60
	61	// TODO: Do some statastics to check for unsigned UTF-16LE
	62	return 0;
	63	}
	64
65	CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
66	{
67	// nothing to do
68	}
69
70	int32_t CharsetRecog_UTF_32::match(InputText* textIn)
71	{
72	const uint8_t *input = textIn->fRawInput;
73	int32_t limit = (textIn->fRawLength / 4) * 4;
74	int32_t numValid = 0;
75	int32_t numInvalid = 0;
76	bool hasBOM = FALSE;
77	int32_t confidence = 0;
78
79	if (getChar(input, 0) == 0x0000FEFFUL) {
80	hasBOM = TRUE;
81	}
82
83	for(int32_t i = 0; i < limit; i += 4) {
84	int32_t ch = getChar(input, i);
85
86	if (ch < 0 \|\| ch >= 0x10FFFF \|\| (ch >= 0xD800 && ch <= 0xDFFF)) {
87	numInvalid += 1;
88	} else {
89	numValid += 1;
90	}
91	}
92
93
94	// Cook up some sort of confidence score, based on presense of a BOM
95	// and the existence of valid and/or invalid multi-byte sequences.
96	if (hasBOM && numInvalid==0) {
97	confidence = 100;
98	} else if (hasBOM && numValid > numInvalid*10) {
99	confidence = 80;
100	} else if (numValid > 3 && numInvalid == 0) {
101	confidence = 100;
102	} else if (numValid > 0 && numInvalid == 0) {
103	confidence = 80;
104	} else if (numValid > numInvalid*10) {
105	// Probably corruput UTF-32BE data. Valid sequences aren't likely by chance.
106	confidence = 25;
107	}
108
109	return confidence;
110	}
111
112	CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
113	{
114	// nothing to do
115	}
116
117	const char *CharsetRecog_UTF_32_BE::getName() const
118	{
119	return "UTF-32BE";
120	}
121
122	int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
123	{
124	return input[index + 0] << 24 \| input[index + 1] << 16 \|
125	input[index + 2] << 8 \| input[index + 3];
126	}
127
128	CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
129	{
130	// nothing to do
131	}
132
133	const char *CharsetRecog_UTF_32_LE::getName() const
134	{
135	return "UTF-32LE";
136	}
137
138	int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
139	{
140	return input[index + 3] << 24 \| input[index + 2] << 16 \|
141	input[index + 1] << 8 \| input[index + 0];
142	}
143
144	U_NAMESPACE_END
145	#endif
146