[apple/icu.git] / icuSources / i18n / inputext.cpp

/*
 **********************************************************************
 *   Copyright (C) 2005-2006, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "inputext.h"

#include "cmemory.h"
#include "cstring.h"

#include <string.h>

U_NAMESPACE_BEGIN

// Capacity, in bytes, of the fInputBytes working buffer. At most this many
// bytes of (possibly markup-stripped) input are stored for analysis.
#define BUFFER_SIZE 8192

// Element count of an array. The argument is pasted unparenthesized, so pass
// a plain array name (not a pointer and not a compound expression).
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

// Allocate room for 'count' objects of 'type' through uprv_malloc and cast the
// result to 'type *'. The result is whatever uprv_malloc returned; this macro
// performs no check on it.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Release memory obtained with NEW_ARRAY by handing it to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))

/**
 * Construct an InputText with no raw input and no declared encoding.
 *
 * Allocates the BUFFER_SIZE-byte working buffer and the 256-entry byte
 * statistics table. The allocation results are not checked here, so either
 * pointer is whatever uprv_malloc returned.
 */
InputText::InputText()
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                 //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // Per-byte-value occurrence counts for the input
                                                 //   text, filled in by MungeInput().
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // These two are otherwise only assigned by setText() / MungeInput().
    // Give them defined values so that an object that has never had text
    // set does not expose indeterminate state.  They are assigned in the
    // body rather than the initializer list so that the result does not
    // depend on the member declaration order in the header.
    fInputLen = 0;
    fC1Bytes  = FALSE;
}

/**
 * Release the three heap buffers owned by this object: the working byte
 * buffer, the byte statistics table and the copy of the declared encoding
 * name.  fRawInput is not freed here.
 */
InputText::~InputText()
{
    uprv_free((void *) fInputBytes);
    uprv_free((void *) fByteStats);
    uprv_free((void *) fDeclaredEncoding);
}

/**
 * Record the raw bytes that are to be analyzed.
 *
 * Only the pointer is stored; the bytes are not copied here, and
 * MungeInput() later reads them through fRawInput, so the buffer has to
 * stay valid at least until then.  Any previously munged state
 * (fInputLen, fC1Bytes) is reset.
 *
 * @param in   the input bytes, or NULL for "no input".
 * @param len  number of bytes in 'in', or -1 if 'in' is NUL-terminated.
 */
void InputText::setText(const char *in, int32_t len)
{
    fInputLen  = 0;
    fC1Bytes   = FALSE;
    fRawInput  = (const uint8_t *) in;

    if (in == NULL) {
        // No buffer: never call strlen on it, and never advertise a
        // non-zero length that MungeInput() would then try to read.
        fRawLength = 0;
    } else if (len == -1) {
        fRawLength = (int32_t) uprv_strlen(in);
    } else {
        fRawLength = len;
    }
}

/**
 * Store a private, NUL-terminated copy of the declared encoding name.
 *
 * A NULL 'encoding' leaves the current value untouched.  Otherwise the
 * previous copy is released and replaced.  If the new allocation fails,
 * fDeclaredEncoding is left NULL.
 *
 * @param encoding  the encoding name; need not be NUL-terminated when
 *                  'len' is given explicitly.
 * @param len       number of chars in 'encoding'; a negative value
 *                  (callers pass -1) means 'encoding' is NUL-terminated.
 */
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
{
    if (encoding == NULL) {
        return;
    }

    if (len < 0) {
        len = (int32_t) uprv_strlen(encoding);
    }

    uprv_free(fDeclaredEncoding);
    fDeclaredEncoding = NEW_ARRAY(char, len + 1);   // +1 for the terminating \0

    if (fDeclaredEncoding == NULL) {
        return;
    }

    // Copy exactly 'len' chars and terminate explicitly.  Copying len + 1
    // chars would read encoding[len], which may lie outside the caller's
    // buffer, and would leave the copy unterminated whenever that char is
    // not a NUL.
    uprv_strncpy(fDeclaredEncoding, encoding, len);
    fDeclaredEncoding[len] = 0;
}

/**
 * Report whether a raw input buffer is currently attached.
 *
 * @return TRUE if fRawInput is non-NULL, FALSE otherwise.
 */
UBool InputText::isSet() const
{
    if (fRawInput == NULL) {
        return FALSE;
    }

    return TRUE;
}

/**
*  MungeInput - after getting a set of raw input data to be analyzed, preprocess
*               it by removing what appears to be html markup.
*
*  On return fInputBytes / fInputLen hold at most BUFFER_SIZE bytes of the
*  (possibly stripped) input, fByteStats holds the number of occurrences of
*  each byte value in that data, and fC1Bytes tells whether any byte in the
*  range 0x80 - 0x9F occurs in it.
*
* @param fStripTags  TRUE to attempt markup stripping; FALSE to use the raw
*                    input unchanged.
*
* @internal
*/
void InputText::MungeInput(UBool fStripTags) {
    int32_t srci     = 0;
    int32_t dsti     = 0;
    UBool   inMarkup = FALSE;
    int32_t openTags = 0;
    int32_t badTags  = 0;

    // With no raw buffer attached there is nothing that may be read,
    // whatever fRawLength happens to say.
    const int32_t rawLength = (fRawInput != NULL) ? fRawLength : 0;

    //
    //  html / xml markup stripping.
    //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    //     discard everything within < brackets >
    //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
    //     guess as to whether the input was actually marked up at all.
    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
    if (fStripTags) {
        for (srci = 0; srci < rawLength && dsti < BUFFER_SIZE; srci += 1) {
            const uint8_t b = fRawInput[srci];

            if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
                if (inMarkup) {
                    // A '<' while already inside a tag: count it as malformed.
                    badTags += 1;
                }

                inMarkup = TRUE;
                openTags += 1;
            }

            if (! inMarkup) {
                fInputBytes[dsti++] = b;
            }

            // Tested after the copy above so that the closing '>' itself
            // is dropped along with the rest of the tag.
            if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
                inMarkup = FALSE;
            }
        }

        fInputLen = dsti;
    }

    //
    //  If it looks like this input wasn't marked up, or if it looks like it's
    //    essentially nothing but markup abandon the markup stripping.
    //    Detection will have to work on the unstripped input.
    //    (When fStripTags is FALSE, openTags is still 0, so this branch is
    //    always taken and fInputLen is not consulted.)
    //
    if (openTags<5 || openTags/5 < badTags ||
        (fInputLen < 100 && rawLength>600))
    {
        int32_t limit = rawLength;

        if (limit > BUFFER_SIZE) {
            limit = BUFFER_SIZE;
        }

        for (srci=0; srci<limit; srci++) {
            fInputBytes[srci] = fRawInput[srci];
        }

        fInputLen = srci;
    }

    //
    // Tally up the byte occurrence statistics.
    // These are available for use by the various detectors.
    //

    uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);

    for (srci = 0; srci < fInputLen; srci += 1) {
        fByteStats[fInputBytes[srci]] += 1;
    }

    // Recompute the flag from scratch: it describes the data munged by
    // this call, so a TRUE left over from an earlier pass over the same
    // text (for instance with a different fStripTags) must not survive.
    fC1Bytes = FALSE;

    for (int32_t i = 0x80; i <= 0x9F; i += 1) {
        if (fByteStats[i] != 0) {
            fC1Bytes = TRUE;
            break;
        }
    }
}

U_NAMESPACE_END
#endif
Commit	Line	Data
73c04bcf A	1	/*
	2	**********************************************************************
	3	* Copyright (C) 2005-2006, International Business Machines
	4	* Corporation and others. All Rights Reserved.
	5	**********************************************************************
	6	*/
	7
	8	#include "unicode/utypes.h"
	9
	10	#if !UCONFIG_NO_CONVERSION
	11
	12	#include "inputext.h"
	13
	14	#include "cmemory.h"
	15	#include "cstring.h"
	16
	17	#include <string.h>
	18
	19	U_NAMESPACE_BEGIN
	20
	21	#define BUFFER_SIZE 8192
	22
	23	#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
	24
	25	#define NEW_ARRAY(type,count) (type ) uprv_malloc((count) sizeof(type))
	26	#define DELETE_ARRAY(array) uprv_free((void *) (array))
	27
	28	InputText::InputText()
	29	: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
	30	// removed if appropriate.
	31	fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
	32	// Value is percent, not absolute.
	33	fDeclaredEncoding(0),
	34	fRawInput(0),
	35	fRawLength(0)
	36	{
	37
	38	}
	39
	40	InputText::~InputText()
	41	{
	42	DELETE_ARRAY(fDeclaredEncoding);
	43	DELETE_ARRAY(fByteStats);
	44	DELETE_ARRAY(fInputBytes);
	45	}
	46
	47	void InputText::setText(const char *in, int32_t len)
	48	{
	49	fInputLen = 0;
	50	fC1Bytes = FALSE;
	51	fRawInput = (const uint8_t *) in;
	52	fRawLength = len == -1? uprv_strlen(in) : len;
	53	}
	54
	55	void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
	56	{
	57	if(encoding) {
	58	if (len == -1) {
	59	len = uprv_strlen(encoding);
	60	}
	61
	62	len += 1; // to make place for the \0 at the end.
	63	uprv_free(fDeclaredEncoding);
	64	fDeclaredEncoding = NEW_ARRAY(char, len);
65	uprv_strncpy(fDeclaredEncoding, encoding, len);
66	}
67	}
68
69	UBool InputText::isSet() const
70	{
71	return fRawInput != NULL;
72	}
73
74	/**
75	* MungeInput - after getting a set of raw input data to be analyzed, preprocess
76	* it by removing what appears to be html markup.
77	*
78	* @internal
79	*/
80	void InputText::MungeInput(UBool fStripTags) {
81	int srci = 0;
82	int dsti = 0;
83	uint8_t b;
84	bool inMarkup = FALSE;
85	int32_t openTags = 0;
86	int32_t badTags = 0;
87
88	//
89	// html / xml markup stripping.
90	// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
91	// discard everything within < brackets >
92	// Count how many total '<' and illegal (nested) '<' occur, so we can make some
93	// guess as to whether the input was actually marked up at all.
94	// TODO: Think about how this interacts with EBCDIC charsets that are detected.
95	if (fStripTags) {
96	for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
97	b = fRawInput[srci];
98
99	if (b == (uint8_t)0x3C) { /* Check for the ASCII '<' */
100	if (inMarkup) {
101	badTags += 1;
102	}
103
104	inMarkup = TRUE;
105	openTags += 1;
106	}
107
108	if (! inMarkup) {
109	fInputBytes[dsti++] = b;
110	}
111
112	if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
113	inMarkup = FALSE;
114	}
115	}
116
117	fInputLen = dsti;
118	}
119
120	//
121	// If it looks like this input wasn't marked up, or if it looks like it's
122	// essentially nothing but markup abandon the markup stripping.
123	// Detection will have to work on the unstripped input.
124	//
125	if (openTags<5 \|\| openTags/5 < badTags \|\|
126	(fInputLen < 100 && fRawLength>600))
127	{
128	int32_t limit = fRawLength;
129
130	if (limit > BUFFER_SIZE) {
131	limit = BUFFER_SIZE;
132	}
133
134	for (srci=0; srci<limit; srci++) {
135	fInputBytes[srci] = fRawInput[srci];
136	}
137
138	fInputLen = srci;
139	}
140
141	//
142	// Tally up the byte occurence statistics.
143	// These are available for use by the various detectors.
144	//
145
146	uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
147
148	for (srci = 0; srci < fInputLen; srci += 1) {
149	fByteStats[fInputBytes[srci]] += 1;
150	}
151
152	for (int32_t i = 0x80; i <= 0x9F; i += 1) {
153	if (fByteStats[i] != 0) {
154	fC1Bytes = TRUE;
155	break;
156	}
157	}
158	}
159
160	U_NAMESPACE_END
161	#endif
162