git.saurik.com Git - apple/icu.git/blame - icuSources/i18n/csr2022.cpp

Commit	Line	Data
f3c0d7a5 A	1	// © 2016 and later: Unicode, Inc. and others.
f3c0d7a5 A	2	// License & terms of use: http://www.unicode.org/copyright.html
73c04bcf A	3	/*
73c04bcf A	4	**********************************************************************
2ca993e8	5	* Copyright (C) 2005-2016, International Business Machines
73c04bcf A	6	* Corporation and others. All Rights Reserved.
	7	**********************************************************************
	8	*/
	9
	10	#include "unicode/utypes.h"
	11
	12	#if !UCONFIG_NO_CONVERSION
	13
2ca993e8	14	#include "cmemory.h"
73c04bcf A	15	#include "cstring.h"
	16
	17	#include "csr2022.h"
51004dcb	18	#include "csmatch.h"
73c04bcf A	19
	20	U_NAMESPACE_BEGIN
	21
73c04bcf A	22	/**
73c04bcf A	23	* Matching function shared among the 2022 detectors JP, CN and KR
51004dcb	24	* Counts up the number of legal and unrecognized escape sequences in
73c04bcf A	25	* the sample of text, and computes a score based on the total number &
	26	* the proportion that fit the encoding.
	27	*
	28	*
	29	* @param text the byte buffer containing text to analyse
	30	* @param textLen the size of the text in the byte.
	31	* @param escapeSequences the byte escape sequences to test for.
	32	* @return match quality, in the range of 0-100.
	33	*/
51004dcb	34	int32_t CharsetRecog_2022::match_2022(const uint8_t *text, int32_t textLen, const uint8_t escapeSequences[][5], int32_t escapeSequences_length) const
73c04bcf A	35	{
	36	int32_t i, j;
	37	int32_t escN;
	38	int32_t hits = 0;
	39	int32_t misses = 0;
	40	int32_t shifts = 0;
	41	int32_t quality;
	42
	43	i = 0;
	44	while(i < textLen) {
	45	if(text[i] == 0x1B) {
	46	escN = 0;
	47	while(escN < escapeSequences_length) {
73c04bcf	48	const uint8_t *seq = escapeSequences[escN];
729e4ab9	49	int32_t seq_length = (int32_t)uprv_strlen((const char *) seq);
73c04bcf	50
46f4442e A	51	if (textLen-i >= seq_length) {
	52	j = 1;
	53	while(j < seq_length) {
	54	if(seq[j] != text[i+j]) {
	55	goto checkEscapes;
	56	}
	57
	58	j += 1;
73c04bcf A	59	}
73c04bcf A	60
46f4442e A	61	hits += 1;
	62	i += seq_length-1;
	63	goto scanInput;
73c04bcf	64	}
46f4442e	65	// else we ran out of string to compare this time.
73c04bcf A	66	checkEscapes:
	67	escN += 1;
	68	}
	69
	70	misses += 1;
	71	}
	72
	73	if( text[i]== 0x0e \|\| text[i] == 0x0f){
	74	shifts += 1;
	75	}
	76
	77	scanInput:
	78	i += 1;
	79	}
	80
	81	if (hits == 0) {
	82	return 0;
	83	}
	84
	85	//
	86	// Initial quality is based on relative proportion of recongized vs.
	87	// unrecognized escape sequences.
	88	// All good: quality = 100;
	89	// half or less good: quality = 0;
	90	// linear inbetween.
	91	quality = (100hits - 100misses) / (hits + misses);
	92
	93	// Back off quality if there were too few escape sequences seen.
	94	// Include shifts in this computation, so that KR does not get penalized
	95	// for having only a single Escape sequence, but many shifts.
	96	if (hits+shifts < 5) {
	97	quality -= (5-(hits+shifts))*10;
	98	}
	99
	100	if (quality < 0) {
	101	quality = 0;
	102	}
	103
	104	return quality;
	105	}
	106
	107
	108	static const uint8_t escapeSequences_2022JP[][5] = {
	109	{0x1b, 0x24, 0x28, 0x43, 0x00}, // KS X 1001:1992
	110	{0x1b, 0x24, 0x28, 0x44, 0x00}, // JIS X 212-1990
	111	{0x1b, 0x24, 0x40, 0x00, 0x00}, // JIS C 6226-1978
	112	{0x1b, 0x24, 0x41, 0x00, 0x00}, // GB 2312-80
	113	{0x1b, 0x24, 0x42, 0x00, 0x00}, // JIS X 208-1983
	114	{0x1b, 0x26, 0x40, 0x00, 0x00}, // JIS X 208 1990, 1997
	115	{0x1b, 0x28, 0x42, 0x00, 0x00}, // ASCII
	116	{0x1b, 0x28, 0x48, 0x00, 0x00}, // JIS-Roman
	117	{0x1b, 0x28, 0x49, 0x00, 0x00}, // Half-width katakana
	118	{0x1b, 0x28, 0x4a, 0x00, 0x00}, // JIS-Roman
	119	{0x1b, 0x2e, 0x41, 0x00, 0x00}, // ISO 8859-1
	120	{0x1b, 0x2e, 0x46, 0x00, 0x00} // ISO 8859-7
	121	};
	122
b331163b	123	#if !UCONFIG_ONLY_HTML_CONVERSION
// The single escape sequence counted as a hit when scoring ISO-2022-KR.
static const uint8_t escapeSequences_2022KR[][5] = {
    {0x1B, 0x24, 0x29, 0x43, 0x00}    // ESC $ ) C
};
	127
	128	static const uint8_t escapeSequences_2022CN[][5] = {
	129	{0x1b, 0x24, 0x29, 0x41, 0x00}, // GB 2312-80
	130	{0x1b, 0x24, 0x29, 0x47, 0x00}, // CNS 11643-1992 Plane 1
	131	{0x1b, 0x24, 0x2A, 0x48, 0x00}, // CNS 11643-1992 Plane 2
	132	{0x1b, 0x24, 0x29, 0x45, 0x00}, // ISO-IR-165
	133	{0x1b, 0x24, 0x2B, 0x49, 0x00}, // CNS 11643-1992 Plane 3
	134	{0x1b, 0x24, 0x2B, 0x4A, 0x00}, // CNS 11643-1992 Plane 4
	135	{0x1b, 0x24, 0x2B, 0x4B, 0x00}, // CNS 11643-1992 Plane 5
	136	{0x1b, 0x24, 0x2B, 0x4C, 0x00}, // CNS 11643-1992 Plane 6
	137	{0x1b, 0x24, 0x2B, 0x4D, 0x00}, // CNS 11643-1992 Plane 7
	138	{0x1b, 0x4e, 0x00, 0x00, 0x00}, // SS2
	139	{0x1b, 0x4f, 0x00, 0x00, 0x00}, // SS3
	140	};
b331163b	141	#endif
73c04bcf	142
4388f060 A	143	CharsetRecog_2022JP::~CharsetRecog_2022JP() {}
4388f060 A	144
51004dcb	145	const char *CharsetRecog_2022JP::getName() const {
73c04bcf A	146	return "ISO-2022-JP";
	147	}
	148
51004dcb A	149	UBool CharsetRecog_2022JP::match(InputText textIn, CharsetMatch results) const {
	150	int32_t confidence = match_2022(textIn->fInputBytes,
	151	textIn->fInputLen,
	152	escapeSequences_2022JP,
2ca993e8	153	UPRV_LENGTHOF(escapeSequences_2022JP));
51004dcb A	154	results->set(textIn, this, confidence);
51004dcb A	155	return (confidence > 0);
73c04bcf A	156	}
73c04bcf A	157
b331163b	158	#if !UCONFIG_ONLY_HTML_CONVERSION
4388f060 A	159	CharsetRecog_2022KR::~CharsetRecog_2022KR() {}
4388f060 A	160
51004dcb	161	const char *CharsetRecog_2022KR::getName() const {
73c04bcf A	162	return "ISO-2022-KR";
	163	}
	164
51004dcb A	165	UBool CharsetRecog_2022KR::match(InputText textIn, CharsetMatch results) const {
	166	int32_t confidence = match_2022(textIn->fInputBytes,
	167	textIn->fInputLen,
	168	escapeSequences_2022KR,
2ca993e8	169	UPRV_LENGTHOF(escapeSequences_2022KR));
51004dcb A	170	results->set(textIn, this, confidence);
51004dcb A	171	return (confidence > 0);
73c04bcf A	172	}
73c04bcf A	173
4388f060 A	174	CharsetRecog_2022CN::~CharsetRecog_2022CN() {}
4388f060 A	175
51004dcb	176	const char *CharsetRecog_2022CN::getName() const {
73c04bcf A	177	return "ISO-2022-CN";
	178	}
	179
51004dcb A	180	UBool CharsetRecog_2022CN::match(InputText textIn, CharsetMatch results) const {
	181	int32_t confidence = match_2022(textIn->fInputBytes,
	182	textIn->fInputLen,
	183	escapeSequences_2022CN,
2ca993e8	184	UPRV_LENGTHOF(escapeSequences_2022CN));
51004dcb A	185	results->set(textIn, this, confidence);
51004dcb A	186	return (confidence > 0);
73c04bcf	187	}
b331163b	188	#endif
73c04bcf	189
51004dcb	190	CharsetRecog_2022::~CharsetRecog_2022() {
73c04bcf A	191	// nothing to do
	192	}
	193
	194	U_NAMESPACE_END
	195	#endif