git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	**********************************************************************
	5	* Copyright (C) 2005-2016, International Business Machines
	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_CONVERSION
	13
	14	#include "inputext.h"
	15
	16	#include "cmemory.h"
	17	#include "cstring.h"
	18
	19	#include <string.h>
	20
	21	U_NAMESPACE_BEGIN
	22
	// Capacity, in bytes, of the fInputBytes working buffer.
	#define BUFFER_SIZE 8192

	// Allocate room for `count` elements of `type` via uprv_malloc(); evaluates to a
	// `type *`, which is NULL when the allocation fails.
	#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
	// Release an array obtained from NEW_ARRAY.
	#define DELETE_ARRAY(array) uprv_free((void *) (array))
	27
	28	InputText::InputText(UErrorCode &status)
	29	: fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked. Markup will have been
	30	// removed if appropriate.
	31	fByteStats(NEW_ARRAY(int16_t, 256)), // byte frequency statistics for the input text.
	32	// Value is percent, not absolute.
	33	fDeclaredEncoding(0),
	34	fRawInput(0),
	35	fRawLength(0)
	36	{
	37	if (fInputBytes == NULL \|\| fByteStats == NULL) {
	38	status = U_MEMORY_ALLOCATION_ERROR;
	39	}
	40	}
	41
	42	InputText::~InputText()
	43	{
	44	DELETE_ARRAY(fDeclaredEncoding);
	45	DELETE_ARRAY(fByteStats);
	46	DELETE_ARRAY(fInputBytes);
	47	}
	48
	49	void InputText::setText(const char *in, int32_t len)
	50	{
	51	fInputLen = 0;
	52	fC1Bytes = FALSE;
	53	fRawInput = (const uint8_t *) in;
	54	fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
	55	}
	56
	57	void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
	58	{
	59	if(encoding) {
	60	if (len == -1) {
	61	len = (int32_t)uprv_strlen(encoding);
	62	}
	63
	64	len += 1; // to make place for the \0 at the end.
	65	uprv_free(fDeclaredEncoding);
	66	fDeclaredEncoding = NEW_ARRAY(char, len);
	67	uprv_strncpy(fDeclaredEncoding, encoding, len);
	68	}
	69	}
	70
	71	UBool InputText::isSet() const
	72	{
	73	return fRawInput != NULL;
	74	}
	75
	76	/**
	77	* MungeInput - after getting a set of raw input data to be analyzed, preprocess
	78	* it by removing what appears to be html markup. Currently only used
	79	* by CharsetDetector::detectAll.
	80	*
	81	* @internal
	82	*/
	83	void InputText::MungeInput(UBool fStripTags) {
	84	int srci = 0;
	85	int dsti = 0;
	86	uint8_t b;
	87	bool inMarkup = FALSE;
	88	bool inCSSDecl = FALSE;
	89	int32_t openTags = 0;
	90	int32_t badTags = 0;
	91
	92	//
	93	// html / xml markup stripping.
	94	// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
	95	// discard everything within < brackets >
	96	// Count how many total '<' and illegal (nested) '<' occur, so we can make some
	97	// guess as to whether the input was actually marked up at all.
	98	// TODO: Think about how this interacts with EBCDIC charsets that are detected.
	99	if (fStripTags) {
	100	for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
	101	b = fRawInput[srci];
	102
	103	if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
	104	if (inMarkup) {
	105	badTags += 1;
	106	}
	107	inMarkup = TRUE;
	108	openTags += 1;
	109	}
	110
	111	if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
	112	if (inCSSDecl) {
	113	badTags += 1;
	114	}
	115	inCSSDecl = TRUE;
	116	openTags += 1;
	117	}
	118
	119	if (!inMarkup && !inCSSDecl) {
	120	fInputBytes[dsti++] = b;
	121	}
	122
	123	if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
	124	inMarkup = FALSE;
	125	}
	126	if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
	127	inCSSDecl = FALSE;
	128	}
	129	}
	130
	131	fInputLen = dsti;
	132	}
	133
	134	//
	135	// If it looks like this input wasn't marked up, or if it looks like it's
	136	// essentially nothing but markup abandon the markup stripping.
	137	// Detection will have to work on the unstripped input.
	138	//
	139	if (openTags<5 \|\| openTags/5 < badTags \|\|
	140	(fInputLen < 100 && fRawLength>600))
	141	{
	142	int32_t limit = fRawLength;
	143
	144	if (limit > BUFFER_SIZE) {
	145	limit = BUFFER_SIZE;
	146	}
	147
	148	for (srci=0; srci<limit; srci++) {
	149	fInputBytes[srci] = fRawInput[srci];
	150	}
	151
	152	fInputLen = srci;
	153	}
	154
	155	//
	156	// Tally up the byte occurence statistics.
	157	// These are available for use by the various detectors.
	158	//
	159
	160	uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
	161
	162	for (srci = 0; srci < fInputLen; srci += 1) {
	163	fByteStats[fInputBytes[srci]] += 1;
	164	}
	165
	166	for (int32_t i = 0x80; i <= 0x9F; i += 1) {
	167	if (fByteStats[i] != 0) {
	168	fC1Bytes = TRUE;
	169	break;
	170	}
	171	}
	172	}
	173
	174	U_NAMESPACE_END
	175	#endif
	176