git.saurik.com Git - apple/icu.git/blame_incremental

... / ...

Commit	Line	Data
	1	// © 2016 and later: Unicode, Inc. and others.
	2	// License & terms of use: http://www.unicode.org/copyright.html
	3	/*
	4	**********************************************************************
	5	* Copyright (C) 2005-2016, International Business Machines
	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_CONVERSION
	13
	14	#include "unicode/ucsdet.h"
	15
	16	#include "csdetect.h"
	17	#include "csmatch.h"
	18	#include "uenumimp.h"
	19
	20	#include "cmemory.h"
	21	#include "cstring.h"
	22	#include "umutex.h"
	23	#include "ucln_in.h"
	24	#include "uarrsort.h"
	25	#include "inputext.h"
	26	#include "csrsbcs.h"
	27	#include "csrmbcs.h"
	28	#include "csrutf8.h"
	29	#include "csrucode.h"
	30	#include "csr2022.h"
	31
	/**
	 * Allocates uninitialized storage for `count` elements of `type` with
	 * uprv_malloc() and casts the result to `type *`. No constructors are run;
	 * the callers in this file compare the result against NULL.
	 */
	#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
	/** Hands storage obtained from NEW_ARRAY() back to uprv_free(). */
	#define DELETE_ARRAY(array) uprv_free((void *) (array))
	34
	35	U_NAMESPACE_BEGIN
	36
	37	struct CSRecognizerInfo : public UMemory {
	38	CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
	39	: recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}
	40
	41	~CSRecognizerInfo() {delete recognizer;}
	42
	43	CharsetRecognizer *recognizer;
	44	UBool isDefaultEnabled;
	45	};
	46
	47	U_NAMESPACE_END
	48
	49	static icu::CSRecognizerInfo **fCSRecognizers = NULL;
	50	static icu::UInitOnce gCSRecognizersInitOnce = U_INITONCE_INITIALIZER;
	51	static int32_t fCSRecognizers_size = 0;
	52
	53	U_CDECL_BEGIN
	54	static UBool U_CALLCONV csdet_cleanup(void)
	55	{
	56	U_NAMESPACE_USE
	57	if (fCSRecognizers != NULL) {
	58	for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
	59	delete fCSRecognizers[r];
	60	fCSRecognizers[r] = NULL;
	61	}
	62
	63	DELETE_ARRAY(fCSRecognizers);
	64	fCSRecognizers = NULL;
	65	fCSRecognizers_size = 0;
	66	}
	67	gCSRecognizersInitOnce.reset();
	68
	69	return TRUE;
	70	}
	71
	72	static int32_t U_CALLCONV
	73	charsetMatchComparator(const void * /context/, const void left, const void right)
	74	{
	75	U_NAMESPACE_USE
	76
	77	const CharsetMatch csm_l = (const CharsetMatch ) left;
	78	const CharsetMatch csm_r = (const CharsetMatch ) right;
	79
	80	// NOTE: compare is backwards to sort from highest to lowest.
	81	return (csm_r)->getConfidence() - (csm_l)->getConfidence();
	82	}
	83
	84	static void U_CALLCONV initRecognizers(UErrorCode &status) {
	85	U_NAMESPACE_USE
	86	ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
	87	CSRecognizerInfo *tempArray[] = {
	88	new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
	89
	90	new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
	91	new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
	92	new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
	93	new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
	94
	95	new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
	96	new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
	97	new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
	98	new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
	99	new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
	100	new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
	101	new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
	102	new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
	103	new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
	104	new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
	105	new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
	106	new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
	107	new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
	108	new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
	109	new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
	110	new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
	111
	112	new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
	113	#if !UCONFIG_ONLY_HTML_CONVERSION
	114	new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
	115	new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
	116
	117	new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
	118	new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
	119	new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
	120	new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
	121	#endif
	122	};
	123	int32_t rCount = UPRV_LENGTHOF(tempArray);
	124
	125	fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
	126
	127	if (fCSRecognizers == NULL) {
	128	status = U_MEMORY_ALLOCATION_ERROR;
	129	}
	130	else {
	131	fCSRecognizers_size = rCount;
	132	for (int32_t r = 0; r < rCount; r += 1) {
	133	fCSRecognizers[r] = tempArray[r];
	134	if (fCSRecognizers[r] == NULL) {
	135	status = U_MEMORY_ALLOCATION_ERROR;
	136	}
	137	}
	138	}
	139	}
	140
	141	U_CDECL_END
	142
	143	U_NAMESPACE_BEGIN
	144
	145	void CharsetDetector::setRecognizers(UErrorCode &status)
	146	{
	147	umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
	148	}
	149
	150	CharsetDetector::CharsetDetector(UErrorCode &status)
	151	: textIn(new InputText(status)), resultArray(NULL),
	152	resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
	153	fEnabledRecognizers(NULL)
	154	{
	155	if (U_FAILURE(status)) {
	156	return;
	157	}
	158
	159	setRecognizers(status);
	160
	161	if (U_FAILURE(status)) {
	162	return;
	163	}
	164
	165	resultArray = (CharsetMatch *)uprv_malloc(sizeof(CharsetMatch )*fCSRecognizers_size);
	166
	167	if (resultArray == NULL) {
	168	status = U_MEMORY_ALLOCATION_ERROR;
	169	return;
	170	}
	171
	172	for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
	173	resultArray[i] = new CharsetMatch();
	174
	175	if (resultArray[i] == NULL) {
	176	status = U_MEMORY_ALLOCATION_ERROR;
	177	break;
	178	}
	179	}
	180	}
	181
	182	CharsetDetector::~CharsetDetector()
	183	{
	184	delete textIn;
	185
	186	for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
	187	delete resultArray[i];
	188	}
	189
	190	uprv_free(resultArray);
	191
	192	if (fEnabledRecognizers) {
	193	uprv_free(fEnabledRecognizers);
	194	}
	195	}
	196
	197	void CharsetDetector::setText(const char *in, int32_t len)
	198	{
	199	textIn->setText(in, len);
	200	fFreshTextSet = TRUE;
	201	}
	202
	203	UBool CharsetDetector::setStripTagsFlag(UBool flag)
	204	{
	205	UBool temp = fStripTags;
	206	fStripTags = flag;
	207	fFreshTextSet = TRUE;
	208	return temp;
	209	}
	210
	211	UBool CharsetDetector::getStripTagsFlag() const
	212	{
	213	return fStripTags;
	214	}
	215
	216	void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
	217	{
	218	textIn->setDeclaredEncoding(encoding,len);
	219	}
	220
	221	int32_t CharsetDetector::getDetectableCount()
	222	{
	223	UErrorCode status = U_ZERO_ERROR;
	224
	225	setRecognizers(status);
	226
	227	return fCSRecognizers_size;
	228	}
	229
	230	const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
	231	{
	232	int32_t maxMatchesFound = 0;
	233
	234	detectAll(maxMatchesFound, status);
	235
	236	if(maxMatchesFound > 0) {
	237	return resultArray[0];
	238	} else {
	239	return NULL;
	240	}
	241	}
	242
	243	const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
	244	{
	245	if(!textIn->isSet()) {
	246	status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set
	247
	248	return NULL;
	249	} else if (fFreshTextSet) {
	250	CharsetRecognizer *csr;
	251	int32_t i;
	252
	253	textIn->MungeInput(fStripTags);
	254
	255	// Iterate over all possible charsets, remember all that
	256	// give a match quality > 0.
	257	resultCount = 0;
	258	for (i = 0; i < fCSRecognizers_size; i += 1) {
	259	csr = fCSRecognizers[i]->recognizer;
	260	if (csr->match(textIn, resultArray[resultCount])) {
	261	resultCount++;
	262	}
	263	}
	264
	265	if (resultCount > 1) {
	266	uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
	267	}
	268	fFreshTextSet = FALSE;
	269	}
	270
	271	maxMatchesFound = resultCount;
	272
	273	return resultArray;
	274	}
	275
	276	void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
	277	{
	278	if (U_FAILURE(status)) {
	279	return;
	280	}
	281
	282	int32_t modIdx = -1;
	283	UBool isDefaultVal = FALSE;
	284	for (int32_t i = 0; i < fCSRecognizers_size; i++) {
	285	CSRecognizerInfo *csrinfo = fCSRecognizers[i];
	286	if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
	287	modIdx = i;
	288	isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
	289	break;
	290	}
	291	}
	292	if (modIdx < 0) {
	293	// No matching encoding found
	294	status = U_ILLEGAL_ARGUMENT_ERROR;
	295	return;
	296	}
	297
	298	if (fEnabledRecognizers == NULL && !isDefaultVal) {
	299	// Create an array storing the non default setting
	300	fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
	301	if (fEnabledRecognizers == NULL) {
	302	status = U_MEMORY_ALLOCATION_ERROR;
	303	return;
	304	}
	305	// Initialize the array with default info
	306	for (int32_t i = 0; i < fCSRecognizers_size; i++) {
	307	fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
	308	}
	309	}
	310
	311	if (fEnabledRecognizers != NULL) {
	312	fEnabledRecognizers[modIdx] = enabled;
	313	}
	314	}
	315
	316	/*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
	317	{
	318	if( index > fCSRecognizers_size-1 || index < 0) {
	319	status = U_INDEX_OUTOFBOUNDS_ERROR;
	320
	321	return 0;
	322	} else {
	323	return fCSRecognizers[index]->getName();
	324	}
	325	}*/
	326
	327	U_NAMESPACE_END
	328
	329	U_CDECL_BEGIN
	330	typedef struct {
	331	int32_t currIndex;
	332	UBool all;
	333	UBool *enabledRecognizers;
	334	} Context;
	335
	336
	337
	338	static void U_CALLCONV
	339	enumClose(UEnumeration *en) {
	340	if(en->context != NULL) {
	341	DELETE_ARRAY(en->context);
	342	}
	343
	344	DELETE_ARRAY(en);
	345	}
	346
	347	static int32_t U_CALLCONV
	348	enumCount(UEnumeration en, UErrorCode ) {
	349	if (((Context *)en->context)->all) {
	350	// ucsdet_getAllDetectableCharsets, all charset detector names
	351	return fCSRecognizers_size;
	352	}
	353
	354	// Otherwise, ucsdet_getDetectableCharsets - only enabled ones
	355	int32_t count = 0;
	356	UBool enabledArray = ((Context )en->context)->enabledRecognizers;
	357	if (enabledArray != NULL) {
	358	// custom set
	359	for (int32_t i = 0; i < fCSRecognizers_size; i++) {
	360	if (enabledArray[i]) {
	361	count++;
	362	}
	363	}
	364	} else {
	365	// default set
	366	for (int32_t i = 0; i < fCSRecognizers_size; i++) {
	367	if (fCSRecognizers[i]->isDefaultEnabled) {
	368	count++;
	369	}
	370	}
	371	}
	372	return count;
	373	}
	374
	375	static const char* U_CALLCONV
	376	enumNext(UEnumeration en, int32_t resultLength, UErrorCode * /status/) {
	377	const char *currName = NULL;
	378
	379	if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
	380	if (((Context *)en->context)->all) {
	381	// ucsdet_getAllDetectableCharsets, all charset detector names
	382	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
	383	((Context *)en->context)->currIndex++;
	384	} else {
	385	// ucsdet_getDetectableCharsets
	386	UBool enabledArray = ((Context )en->context)->enabledRecognizers;
	387	if (enabledArray != NULL) {
	388	// custome set
	389	while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
	390	if (enabledArray[((Context *)en->context)->currIndex]) {
	391	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
	392	}
	393	((Context *)en->context)->currIndex++;
	394	}
	395	} else {
	396	// default set
	397	while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
	398	if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
	399	currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
	400	}
	401	((Context *)en->context)->currIndex++;
	402	}
	403	}
	404	}
	405	}
	406
	407	if(resultLength != NULL) {
	408	*resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
	409	}
	410
	411	return currName;
	412	}
	413
	414
	415	static void U_CALLCONV
	416	enumReset(UEnumeration en, UErrorCode ) {
	417	((Context *)en->context)->currIndex = 0;
	418	}
	419
	420	static const UEnumeration gCSDetEnumeration = {
	421	NULL,
	422	NULL,
	423	enumClose,
	424	enumCount,
	425	uenum_unextDefault,
	426	enumNext,
	427	enumReset
	428	};
	429
	430	U_CDECL_END
	431
	432	U_NAMESPACE_BEGIN
	433
	434	UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
	435	{
	436
	437	/* Initialize recognized charsets. */
	438	setRecognizers(status);
	439
	440	if(U_FAILURE(status)) {
	441	return 0;
	442	}
	443
	444	UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
	445	if (en == NULL) {
	446	status = U_MEMORY_ALLOCATION_ERROR;
	447	return 0;
	448	}
	449	memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
	450	en->context = (void*)NEW_ARRAY(Context, 1);
	451	if (en->context == NULL) {
	452	status = U_MEMORY_ALLOCATION_ERROR;
	453	DELETE_ARRAY(en);
	454	return 0;
	455	}
	456	uprv_memset(en->context, 0, sizeof(Context));
	457	((Context*)en->context)->all = TRUE;
	458	return en;
	459	}
	460
	461	UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
	462	{
	463	if(U_FAILURE(status)) {
	464	return 0;
	465	}
	466
	467	UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
	468	if (en == NULL) {
	469	status = U_MEMORY_ALLOCATION_ERROR;
	470	return 0;
	471	}
	472	memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
	473	en->context = (void*)NEW_ARRAY(Context, 1);
	474	if (en->context == NULL) {
	475	status = U_MEMORY_ALLOCATION_ERROR;
	476	DELETE_ARRAY(en);
	477	return 0;
	478	}
	479	uprv_memset(en->context, 0, sizeof(Context));
	480	((Context*)en->context)->all = FALSE;
	481	((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
	482	return en;
	483	}
	484
	485	U_NAMESPACE_END
	486
	487	#endif