[apple/icu.git] / icuSources / i18n / inputext.cpp

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (C) 2005-2016, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "inputext.h"

#include "cmemory.h"
#include "cstring.h"

#include <string.h>

U_NAMESPACE_BEGIN

// Number of bytes allocated for fInputBytes in the constructor; MungeInput
// never stores more than this many bytes of (possibly stripped) input.
#define BUFFER_SIZE 8192

// Allocates room for `count` elements of `type` through uprv_malloc and casts
// the result. The size product is not overflow-checked and the result is not
// tested here, so callers must compare the returned pointer against NULL.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Hands `array` back to uprv_free.
#define DELETE_ARRAY(array) uprv_free((void *) (array))

/**
 * Allocates the two working buffers used by MungeInput and leaves the object
 * in a defined "no text set" state.
 *
 * @param status  Set to U_MEMORY_ALLOCATION_ERROR when either buffer could not
 *                be allocated; not modified otherwise.
 */
InputText::InputText(UErrorCode &status)
    : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
                                                 //   removed if appropriate.
      fByteStats(NEW_ARRAY(int16_t, 256)),       // Occurrence count of each byte value in fInputBytes
                                                 //   (filled in by MungeInput).
      fDeclaredEncoding(0),
      fRawInput(0),
      fRawLength(0)
{
    // These members are otherwise first written by setText(). Give them
    // defined values so that nothing reads an indeterminate value when the
    // object is used before any text has been set.
    fInputLen = 0;
    fC1Bytes = FALSE;
    fOnlyTypicalASCII = FALSE;

    if (fInputBytes == NULL || fByteStats == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

/**
 * Releases the declared-encoding copy and both working buffers.
 */
InputText::~InputText()
{
    uprv_free(static_cast<void *>(fDeclaredEncoding));
    uprv_free(static_cast<void *>(fByteStats));
    uprv_free(static_cast<void *>(fInputBytes));
}

/**
 * Records the raw input to be analyzed and resets the values derived from any
 * previous input. The bytes are not copied; only the pointer is stored.
 *
 * @param in   The input bytes. May be NULL, in which case the raw length is
 *             recorded as 0 and isSet() reports false.
 * @param len  Number of bytes in `in`, or -1 if `in` is NUL-terminated.
 */
void InputText::setText(const char *in, int32_t len)
{
    fInputLen  = 0;
    fC1Bytes   = FALSE;
    fOnlyTypicalASCII = FALSE; // rdar://56373519
    fRawInput  = (const uint8_t *) in;

    if (in == NULL) {
        // Never measure or advertise bytes behind a NULL pointer.
        fRawLength = 0;
    } else if (len == -1) {
        fRawLength = (int32_t)uprv_strlen(in);
    } else {
        fRawLength = len;
    }
}

/**
 * Stores a private, NUL-terminated copy of the caller-declared encoding name.
 *
 * The call is a no-op (any previously stored name is kept) when `encoding` is
 * NULL, when `len` is negative and not -1, or when the copy cannot be
 * allocated.
 *
 * @param encoding  The declared encoding name, or NULL.
 * @param len       Number of chars in `encoding`, or -1 if it is NUL-terminated.
 */
void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
{
    if (encoding == NULL) {
        return;
    }

    if (len == -1) {
        len = (int32_t)uprv_strlen(encoding);
    }
    if (len < 0) {
        return;     // invalid length; would otherwise yield a bogus allocation size
    }

    // One extra char for the terminating \0. The size is computed in size_t so
    // that len + 1 cannot overflow int32_t.
    char *copy = NEW_ARRAY(char, (size_t)len + 1);
    if (copy == NULL) {
        return;
    }

    // Read at most `len` chars from the caller (the source need not be
    // NUL-terminated when an explicit length is given) and terminate explicitly.
    uprv_strncpy(copy, encoding, len);
    copy[len] = 0;

    // Release the old value only after copying, in case `encoding` aliases it.
    uprv_free(fDeclaredEncoding);
    fDeclaredEncoding = copy;
}

/**
 * Reports whether a non-NULL raw input pointer is currently stored.
 */
UBool InputText::isSet() const 
{
    if (fRawInput == NULL) {
        return FALSE;
    }
    return TRUE;
}

/**
*  MungeInput - turn the raw input recorded by setText() into the working
*               buffer and statistics:
*
*    1. When requested, drop every byte that lies inside <...> or {...}
*       (treated as html/xml markup and CSS declarations respectively).
*    2. If the input does not look marked up, or stripping left almost
*       nothing, copy the raw bytes unchanged instead.
*    3. Count the occurrences of every byte value and derive fC1Bytes and
*       fOnlyTypicalASCII from the counts.
*
*  At most BUFFER_SIZE bytes are placed in fInputBytes.
*
* @param fStripTags  Attempt markup stripping before gathering statistics.
* @internal
*/
void InputText::MungeInput(UBool fStripTags) {
    int     rawPos    = 0;
    int     outPos    = 0;
    bool    insideTag = FALSE;
    bool    insideCSS = FALSE;
    int32_t opened    = 0;    // number of '<' and '{' that opened a region
    int32_t nested    = 0;    // openers seen while the same kind of region was already open

    //
    //  Markup stripping. Deliberately crude: it only has to be statistically
    //     good enough. The opener / nested-opener counts feed the guess below
    //     about whether the input was marked up at all.
    // TODO: Think about how this interacts with EBCDIC charsets that are detected.
    //
    if (fStripTags) {
        rawPos = 0;
        while (rawPos < fRawLength && outPos < BUFFER_SIZE) {
            const uint8_t ch = fRawInput[rawPos];

            // Openers take effect before the byte is considered for copying,
            // so the opener itself is dropped.
            switch (ch) {
            case 0x3C: /* ASCII '<' */
                if (!insideCSS) {
                    if (insideTag) {
                        ++nested;
                    }
                    insideTag = TRUE;
                    ++opened;
                }
                break;

            case 0x7B: /* ASCII '{' */
                if (!insideTag) {
                    if (insideCSS) {
                        ++nested;
                    }
                    insideCSS = TRUE;
                    ++opened;
                }
                break;

            default:
                break;
            }

            if (!(insideTag || insideCSS)) {
                fInputBytes[outPos] = ch;
                ++outPos;
            }

            // Closers take effect after the copy decision, so the closer
            // that ends a region is dropped as well.
            if (ch == (uint8_t)0x3E) {          /* ASCII '>' */
                insideTag = FALSE;
            } else if (ch == (uint8_t)0x7D) {   /* ASCII '}' */
                insideCSS = FALSE;
            }

            ++rawPos;
        }

        fInputLen = outPos;
    }

    //
    //  Give up on stripping when there were few openers, when too many of
    //    them were nested, or when a sizeable input shrank to almost nothing.
    //    Detection then works on the unstripped bytes.
    //
    if (opened < 5 || opened / 5 < nested ||
        (fInputLen < 100 && fRawLength > 600))
    {
        const int32_t copyLen = (fRawLength > BUFFER_SIZE) ? BUFFER_SIZE : fRawLength;

        for (rawPos = 0; rawPos < copyLen; ++rawPos) {
            fInputBytes[rawPos] = fRawInput[rawPos];
        }

        fInputLen = rawPos;
    }

    //
    // Tally up the byte occurrence statistics.
    // These are available for use by the various detectors.
    //
    uprv_memset(fByteStats, 0, 256 * (sizeof fByteStats[0]));

    for (int32_t k = 0; k < fInputLen; ++k) {
        fByteStats[fInputBytes[k]] += 1;
    }

    //
    // Classify the byte values that actually occur (rdar://56373519).
    // "Typical ASCII" here means 0x20..0x7E plus tab, LF and CR; byte 0x00 is
    // judged separately after the loop.
    //
    fOnlyTypicalASCII = TRUE;
    for (int32_t value = 0x01; value <= 0xFF; ++value) {
        if (fByteStats[value] == 0) {
            continue;
        }

        const bool isTabOrLineBreak = (value == 0x09 || value == 0x0A || value == 0x0D);
        const bool isOtherControl   = (value < 0x20) && !isTabOrLineBreak;
        if (!isOtherControl && value <= 0x7E) {
            continue;
        }

        fOnlyTypicalASCII = FALSE;
        if (value >= 0x80 && value <= 0x9F) {
            fC1Bytes = TRUE;
        }
    }

    // A single 0x00 byte is tolerated; more than one is not.
    if (fByteStats[0] > 1) {
        fOnlyTypicalASCII = FALSE;
    }
}

U_NAMESPACE_END
#endif
Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf A	3	/*
73c04bcf A	4	**********************************************************************
2ca993e8	5	* Copyright (C) 2005-2016, International Business Machines
73c04bcf A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_CONVERSION
	13
	14	#include "inputext.h"
	15
	16	#include "cmemory.h"
	17	#include "cstring.h"
	18
	19	#include <string.h>
	20
	21	U_NAMESPACE_BEGIN
	22
	23	#define BUFFER_SIZE 8192
	24
73c04bcf A	25	#define NEW_ARRAY(type,count) (type ) uprv_malloc((count) sizeof(type))
	26	#define DELETE_ARRAY(array) uprv_free((void *) (array))
	27
46f4442e	28	InputText::InputText(UErrorCode &status)
73c04bcf A	29	: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
	30	// removed if appropriate.
	31	fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
	32	// Value is percent, not absolute.
	33	fDeclaredEncoding(0),
	34	fRawInput(0),
	35	fRawLength(0)
46f4442e A	36	{
	37	if (fInputBytes == NULL \|\| fByteStats == NULL) {
	38	status = U_MEMORY_ALLOCATION_ERROR;
	39	}
73c04bcf A	40	}
	41
	42	InputText::~InputText()
	43	{
	44	DELETE_ARRAY(fDeclaredEncoding);
	45	DELETE_ARRAY(fByteStats);
	46	DELETE_ARRAY(fInputBytes);
	47	}
	48
	49	void InputText::setText(const char *in, int32_t len)
	50	{
	51	fInputLen = 0;
	52	fC1Bytes = FALSE;
1a147d09	53	fOnlyTypicalASCII = FALSE; // rdar://56373519
73c04bcf	54	fRawInput = (const uint8_t *) in;
729e4ab9	55	fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
73c04bcf A	56	}
	57
	58	void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
	59	{
	60	if(encoding) {
	61	if (len == -1) {
729e4ab9	62	len = (int32_t)uprv_strlen(encoding);
73c04bcf A	63	}
	64
	65	len += 1; // to make place for the \0 at the end.
	66	uprv_free(fDeclaredEncoding);
	67	fDeclaredEncoding = NEW_ARRAY(char, len);
	68	uprv_strncpy(fDeclaredEncoding, encoding, len);
	69	}
	70	}
	71
	72	UBool InputText::isSet() const
	73	{
	74	return fRawInput != NULL;
	75	}
	76
	77	/**
	78	* MungeInput - after getting a set of raw input data to be analyzed, preprocess
249c4c5e A	79	* it by removing what appears to be html markup. Currently only used
249c4c5e A	80	* by CharsetDetector::detectAll.
73c04bcf A	81	*
	82	* @internal
	83	*/
	84	void InputText::MungeInput(UBool fStripTags) {
	85	int srci = 0;
	86	int dsti = 0;
	87	uint8_t b;
	88	bool inMarkup = FALSE;
249c4c5e	89	bool inCSSDecl = FALSE;
73c04bcf A	90	int32_t openTags = 0;
	91	int32_t badTags = 0;
	92
	93	//
	94	// html / xml markup stripping.
	95	// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
	96	// discard everything within < brackets >
	97	// Count how many total '<' and illegal (nested) '<' occur, so we can make some
	98	// guess as to whether the input was actually marked up at all.
	99	// TODO: Think about how this interacts with EBCDIC charsets that are detected.
	100	if (fStripTags) {
	101	for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
	102	b = fRawInput[srci];
	103
249c4c5e	104	if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
73c04bcf A	105	if (inMarkup) {
	106	badTags += 1;
	107	}
73c04bcf A	108	inMarkup = TRUE;
	109	openTags += 1;
	110	}
	111
249c4c5e A	112	if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
	113	if (inCSSDecl) {
	114	badTags += 1;
	115	}
	116	inCSSDecl = TRUE;
	117	openTags += 1;
	118	}
	119
	120	if (!inMarkup && !inCSSDecl) {
73c04bcf A	121	fInputBytes[dsti++] = b;
	122	}
	123
	124	if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
	125	inMarkup = FALSE;
	126	}
249c4c5e A	127	if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
	128	inCSSDecl = FALSE;
	129	}
73c04bcf A	130	}
	131
	132	fInputLen = dsti;
	133	}
	134
	135	//
	136	// If it looks like this input wasn't marked up, or if it looks like it's
	137	// essentially nothing but markup abandon the markup stripping.
	138	// Detection will have to work on the unstripped input.
	139	//
	140	if (openTags<5 \|\| openTags/5 < badTags \|\|
	141	(fInputLen < 100 && fRawLength>600))
	142	{
	143	int32_t limit = fRawLength;
	144
	145	if (limit > BUFFER_SIZE) {
	146	limit = BUFFER_SIZE;
	147	}
	148
	149	for (srci=0; srci<limit; srci++) {
	150	fInputBytes[srci] = fRawInput[srci];
	151	}
	152
	153	fInputLen = srci;
	154	}
	155
	156	//
	157	// Tally up the byte occurence statistics.
	158	// These are available for use by the various detectors.
	159	//
	160
	161	uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
	162
	163	for (srci = 0; srci < fInputLen; srci += 1) {
	164	fByteStats[fInputBytes[srci]] += 1;
	165	}
	166
1a147d09 A	167	fOnlyTypicalASCII = TRUE; // rdar://56373519
1a147d09 A	168	for (int32_t i = 0x01; i <= 0xFF; i += 1) {
73c04bcf	169	if (fByteStats[i] != 0) {
1a147d09 A	170	if ((i < 0x20 && i != 0x09 && i != 0x0A && i != 0x0D) \|\| i > 0x7E) {
	171	fOnlyTypicalASCII = FALSE; // rdar://56373519
	172	if (i >= 0x80 && i <= 0x9F) {
	173	fC1Bytes = TRUE;
	174	}
	175	}
73c04bcf A	176	}
73c04bcf A	177	}
1a147d09 A	178	if (fByteStats[0] > 1) {
	179	fOnlyTypicalASCII = FALSE;
	180	}
73c04bcf A	181	}
	182
	183	U_NAMESPACE_END
	184	#endif
	185