2 **********************************************************************
3 * Copyright (C) 2000-2012, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvisci.c
8 * tab size: 8 (not used)
11 * created on: 2001JUN26
12 * created by: Ram Viswanadha
14 * Date Name Description
15 * 24/7/2001 Ram Added support for EXT character handling
18 #include "unicode/utypes.h"
20 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
22 #include "unicode/ucnv.h"
23 #include "unicode/ucnv_cb.h"
24 #include "unicode/utf16.h"
31 #define UCNV_OPTIONS_VERSION_MASK 0xf
34 #define ZWNJ 0x200c /* Zero Width Non Joiner */
35 #define ZWJ 0x200d /* Zero width Joiner */
36 #define INVALID_CHAR 0xffff
37 #define ATR 0xEF /* Attribute code */
38 #define EXT 0xF0 /* Extension code */
40 #define DOUBLE_DANDA 0x0965
41 #define ISCII_NUKTA 0xE9
42 #define ISCII_HALANT 0xE8
43 #define ISCII_DANDA 0xEA
44 #define ISCII_INV 0xD9
45 #define ISCII_VOWEL_SIGN_E 0xE0
46 #define INDIC_BLOCK_BEGIN 0x0900
47 #define INDIC_BLOCK_END 0x0D7F
48 #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
49 #define VOCALLIC_RR 0x0931
51 #define ASCII_END 0xA0
52 #define NO_CHAR_MARKER 0xFFFE
/* Delta of the Telugu script block. Parenthesized so the product stays intact when the macro is embedded in a larger expression. */
#define TELUGU_DELTA (DELTA * TELUGU)
54 #define DEV_ABBR_SIGN 0x0970
55 #define DEV_ANUDATTA 0x0952
56 #define EXT_RANGE_BEGIN 0xA1
57 #define EXT_RANGE_END 0xEE
59 #define PNJ_DELTA 0x0100
60 #define PNJ_BINDI 0x0A02
61 #define PNJ_TIPPI 0x0A70
62 #define PNJ_SIGN_VIRAMA 0x0A4D
63 #define PNJ_ADHAK 0x0A71
65 #define PNJ_RRA 0x0A5C
81 * Enumeration for switching code pages if <ATR>+<one of below values>
117 #define ISCII_CNV_PREFIX "ISCII,version="
/* Per-converter ISCII state. The *ToUnicode fields are used by the ISCII->Unicode direction, the *FromUnicode fields by Unicode->ISCII. */
120 UChar contextCharToUnicode
; /* previous ISCII byte seen by toUnicode (compared against ATR and EXT), or NO_CHAR_MARKER; used for contextual analysis */
121 UChar contextCharFromUnicode
; /* context left by the previous code unit in fromUnicode: 0, ISCII_HALANT or PNJ_ADHAK */
122 uint16_t defDeltaToUnicode
; /* delta for switching to default state when DEF is encountered */
123 uint16_t currentDeltaFromUnicode
; /* current delta in Indic block (fromUnicode direction) */
124 uint16_t currentDeltaToUnicode
; /* current delta in Indic block (toUnicode direction) */
125 MaskEnum currentMaskFromUnicode
; /* validityTable mask for the current script in fromUnicode */
126 MaskEnum currentMaskToUnicode
; /* validityTable mask for the current script in toUnicode */
127 MaskEnum defMaskToUnicode
; /* mask for default state in toUnicode */
128 UBool isFirstBuffer
; /* boolean for fromUnicode to see if we need to announce the first script */
129 UBool resetToDefaultToUnicode
; /* boolean for resetting to default delta and mask when a newline is encountered */
130 char name
[sizeof(ISCII_CNV_PREFIX
) + 1]; /* holds ISCII_CNV_PREFIX, one version digit and the terminating NUL */
131 UChar32 prevToUnicodeStatus
; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
132 } UConverterDataISCII
;
134 typedef struct LookupDataStruct
{
/*
 * Script lookup rows. _ISCIIOpen indexes this table with the converter's
 * version option (options & UCNV_OPTIONS_VERSION_MASK) and reads .uniLang
 * (scaled by DELTA to form the block delta) and .maskEnum (the validityTable
 * mask). The fromUnicode code indexes it with the Unicode block index
 * ((c - INDIC_BLOCK_BEGIN) / DELTA) and reads .maskEnum and .isciiLang, the
 * byte written after ATR.
 */
140 static const LookupDataStruct lookupInitialData
[]={
141 { DEVANAGARI
, DEV_MASK
, DEV
},
142 { BENGALI
, BNG_MASK
, BNG
},
143 { GURMUKHI
, PNJ_MASK
, PNJ
},
144 { GUJARATI
, GJR_MASK
, GJR
},
145 { ORIYA
, ORI_MASK
, ORI
},
146 { TAMIL
, TML_MASK
, TML
},
/* Telugu deliberately carries KND_MASK: Telugu and Kannada share one validity bit. */
147 { TELUGU
, KND_MASK
, TLG
},
148 { KANNADA
, KND_MASK
, KND
},
149 { MALAYALAM
, MLM_MASK
, MLM
}
153 * For special handling of certain Gurmukhi characters.
154 * Bit 0 (value 1): PNJ consonant
155 * Bit 1 (value 2): PNJ Bindi Tippi
/* Indexed by (c - 0xa00) for code points 0xa00..0xa4f; each row below covers 16 code points. */
157 static const uint8_t pnjMap
[80] = {
/* 0xa00..0xa0f */
159 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xa10..0xa1f */
161 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 0xa20..0xa2f */
163 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3,
/* 0xa30..0xa3f */
165 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2,
/* 0xa40..0xa4f */
167 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Tests the "PNJ consonant" flag (bit 0) of pnjMap for c.
 * The range test keeps the index inside the 80-entry table (0xa00..0xa4f).
 */
171 isPNJConsonant(UChar32 c
) {
172 if (c
< 0xa00 || 0xa50 <= c
) {
175 return (UBool
)(pnjMap
[c
- 0xa00] & 1);
/*
 * Tests the "PNJ Bindi Tippi" flag (bit 1) of pnjMap for c. Table values
 * never exceed 3, so shifting right by one leaves exactly that flag.
 * The range test keeps the index inside the 80-entry table (0xa00..0xa4f).
 */
180 isPNJBindiTippi(UChar32 c
) {
181 if (c
< 0xa00 || 0xa50 <= c
) {
184 return (UBool
)(pnjMap
[c
- 0xa00] >> 1);
/*
 * Open callback: allocates a UConverterDataISCII into cnv->extraInfo and
 * initializes both conversion directions from the script row selected by
 * the version option (pArgs->options & UCNV_OPTIONS_VERSION_MASK).
 * Sets *errorCode to U_ILLEGAL_ARGUMENT_ERROR or U_MEMORY_ALLOCATION_ERROR
 * on the failure paths below.
 */
188 static void _ISCIIOpen(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
) {
189 if(pArgs
->onlyTestIsLoadable
) {
193 cnv
->extraInfo
= uprv_malloc(sizeof(UConverterDataISCII
));
195 if (cnv
->extraInfo
!= NULL
) {
197 UConverterDataISCII
*converterData
=
198 (UConverterDataISCII
*) cnv
->extraInfo
;
/* no pending context in either direction */
199 converterData
->contextCharToUnicode
=NO_CHAR_MARKER
;
200 cnv
->toUnicodeStatus
= missingCharMarker
;
201 converterData
->contextCharFromUnicode
=0x0000;
202 converterData
->resetToDefaultToUnicode
=FALSE
;
203 /* check if the version requested is supported */
/* versions 0..8 map onto the nine rows of lookupInitialData */
204 if ((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) < 9) {
205 /* initialize state variables */
/* current (both directions) and default delta all start at the selected script's block */
206 converterData
->currentDeltaFromUnicode
207 = converterData
->currentDeltaToUnicode
208 = converterData
->defDeltaToUnicode
= (uint16_t)(lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].uniLang
* DELTA
);
/* likewise for the validity mask */
210 converterData
->currentMaskFromUnicode
211 = converterData
->currentMaskToUnicode
212 = converterData
->defMaskToUnicode
= lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].maskEnum
;
214 converterData
->isFirstBuffer
=TRUE
;
/* build the converter name: ISCII_CNV_PREFIX followed by the version digit and a NUL */
215 (void)uprv_strcpy(converterData
->name
, ISCII_CNV_PREFIX
);
216 len
= (int32_t)uprv_strlen(converterData
->name
);
217 converterData
->name
[len
]= (char)((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) + '0');
218 converterData
->name
[len
+1]=0;
220 converterData
->prevToUnicodeStatus
= 0x0000;
/* NOTE(review): presumably the unsupported-version path -- release the state and report an illegal argument */
222 uprv_free(cnv
->extraInfo
);
223 cnv
->extraInfo
= NULL
;
224 *errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
/* NOTE(review): presumably reached when uprv_malloc returned NULL */
228 *errorCode
=U_MEMORY_ALLOCATION_ERROR
;
/* Close callback: frees cnv->extraInfo when it is set and cnv->isExtraLocal is false. */
232 static void _ISCIIClose(UConverter
*cnv
) {
233 if (cnv
->extraInfo
!=NULL
) {
234 if (!cnv
->isExtraLocal
) {
235 uprv_free(cnv
->extraInfo
);
/*
 * Name callback. When cnv->extraInfo is set it is viewed as the
 * UConverterDataISCII state.
 * NOTE(review): presumably returns myData->name (built in _ISCIIOpen) -- confirm against the return statements.
 */
241 static const char* _ISCIIgetName(const UConverter
* cnv
) {
242 if (cnv
->extraInfo
) {
243 UConverterDataISCII
* myData
= (UConverterDataISCII
*)cnv
->extraInfo
;
/*
 * Reset callback. The first test (choice <= UCNV_RESET_TO_UNICODE) resets the
 * toUnicode state; the second (choice != UCNV_RESET_TO_UNICODE) resets the
 * fromUnicode state.
 */
249 static void _ISCIIReset(UConverter
*cnv
, UConverterResetChoice choice
) {
250 UConverterDataISCII
* data
=(UConverterDataISCII
*) (cnv
->extraInfo
);
251 if (choice
<=UCNV_RESET_TO_UNICODE
) {
252 cnv
->toUnicodeStatus
= missingCharMarker
;
/* back to the default script chosen at open time */
254 data
->currentDeltaToUnicode
=data
->defDeltaToUnicode
;
255 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
256 data
->contextCharToUnicode
=NO_CHAR_MARKER
;
257 data
->prevToUnicodeStatus
= 0x0000;
259 if (choice
!=UCNV_RESET_TO_UNICODE
) {
260 cnv
->fromUChar32
=0x0000;
261 data
->contextCharFromUnicode
=0x00;
/* the fromUnicode direction has no default fields of its own; it reuses the toUnicode defaults */
262 data
->currentMaskFromUnicode
=data
->defMaskToUnicode
;
263 data
->currentDeltaFromUnicode
=data
->defDeltaToUnicode
;
264 data
->isFirstBuffer
=TRUE
;
/* NOTE(review): a toUnicode flag cleared among the fromUnicode resets -- confirm this placement is intended */
265 data
->resetToDefaultToUnicode
=FALSE
;
270 * The values in validity table are indexed by the lower 7 bits of a code point
271 * normalized to the range 0x0900 - 0x097f. The values have a structure like:
272 * ---------------------------------------------------------------
273 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
274 * | | | | | ASM | KND | | |
275 * ---------------------------------------------------------------
276 * If a code point is valid in a particular script
277 * then that bit is turned on
279 * Unicode does not distinguish between Bengali and Assamese so we use 1 bit
280 * to represent these languages
282 * Telugu and Kannada have the same codepoints except for Vocallic_RR, which we special case,
283 * so we combine them and use 1 bit to represent these languages.
285 * TODO: It is probably easier to understand and maintain to change this
286 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
289 static const uint8_t validityTable
[128] = {
290 /* This state table is tool generated please do not edit unless you know exactly what you are doing */
291 /* Note: This table was edited to mirror the Windows XP implementation */
292 /*ISCII:Valid:Unicode */
293 /*0xa0 : 0x00: 0x900 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
294 /*0xa1 : 0xb8: 0x901 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
295 /*0xa2 : 0xfe: 0x902 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
296 /*0xa3 : 0xbf: 0x903 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
297 /*0x00 : 0x00: 0x904 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
298 /*0xa4 : 0xff: 0x905 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
299 /*0xa5 : 0xff: 0x906 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
300 /*0xa6 : 0xff: 0x907 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
301 /*0xa7 : 0xff: 0x908 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
302 /*0xa8 : 0xff: 0x909 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
303 /*0xa9 : 0xff: 0x90a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
304 /*0xaa : 0xfe: 0x90b */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
305 /*0x00 : 0x00: 0x90c */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
306 /*0xae : 0x80: 0x90d */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
307 /*0xab : 0x87: 0x90e */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
308 /*0xac : 0xff: 0x90f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
309 /*0xad : 0xff: 0x910 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
310 /*0xb2 : 0x80: 0x911 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
311 /*0xaf : 0x87: 0x912 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
312 /*0xb0 : 0xff: 0x913 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
313 /*0xb1 : 0xff: 0x914 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
314 /*0xb3 : 0xff: 0x915 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
315 /*0xb4 : 0xfe: 0x916 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
316 /*0xb5 : 0xfe: 0x917 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
317 /*0xb6 : 0xfe: 0x918 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
318 /*0xb7 : 0xff: 0x919 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
319 /*0xb8 : 0xff: 0x91a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
320 /*0xb9 : 0xfe: 0x91b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
321 /*0xba : 0xff: 0x91c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
322 /*0xbb : 0xfe: 0x91d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
323 /*0xbc : 0xff: 0x91e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
324 /*0xbd : 0xff: 0x91f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
325 /*0xbe : 0xfe: 0x920 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
326 /*0xbf : 0xfe: 0x921 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
327 /*0xc0 : 0xfe: 0x922 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
328 /*0xc1 : 0xff: 0x923 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
329 /*0xc2 : 0xff: 0x924 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
330 /*0xc3 : 0xfe: 0x925 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
331 /*0xc4 : 0xfe: 0x926 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
332 /*0xc5 : 0xfe: 0x927 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
333 /*0xc6 : 0xff: 0x928 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
334 /*0xc7 : 0x81: 0x929 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ TML_MASK
,
335 /*0xc8 : 0xff: 0x92a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
336 /*0xc9 : 0xfe: 0x92b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
337 /*0xca : 0xfe: 0x92c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
338 /*0xcb : 0xfe: 0x92d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
339 /*0xcc : 0xfe: 0x92e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
340 /*0xcd : 0xff: 0x92f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
341 /*0xcf : 0xff: 0x930 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
342 /*0xd0 : 0x87: 0x931 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
343 /*0xd1 : 0xff: 0x932 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
344 /*0xd2 : 0xb7: 0x933 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
345 /*0xd3 : 0x83: 0x934 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
346 /*0xd4 : 0xff: 0x935 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
347 /*0xd5 : 0xfe: 0x936 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
348 /*0xd6 : 0xbf: 0x937 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
349 /*0xd7 : 0xff: 0x938 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
350 /*0xd8 : 0xff: 0x939 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
351 /*0x00 : 0x00: 0x93A */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
352 /*0x00 : 0x00: 0x93B */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
353 /*0xe9 : 0xda: 0x93c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
354 /*0x00 : 0x00: 0x93d */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
355 /*0xda : 0xff: 0x93e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
356 /*0xdb : 0xff: 0x93f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
357 /*0xdc : 0xff: 0x940 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
358 /*0xdd : 0xff: 0x941 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
359 /*0xde : 0xff: 0x942 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
360 /*0xdf : 0xbe: 0x943 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
361 /*0x00 : 0x00: 0x944 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ BNG_MASK
+ KND_MASK
+ ZERO
+ ZERO
,
362 /*0xe3 : 0x80: 0x945 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
363 /*0xe0 : 0x87: 0x946 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
364 /*0xe1 : 0xff: 0x947 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
365 /*0xe2 : 0xff: 0x948 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
366 /*0xe7 : 0x80: 0x949 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
367 /*0xe4 : 0x87: 0x94a */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
368 /*0xe5 : 0xff: 0x94b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
369 /*0xe6 : 0xff: 0x94c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
370 /*0xe8 : 0xff: 0x94d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
371 /*0xec : 0x00: 0x94e */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
372 /*0xed : 0x00: 0x94f */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
373 /*0x00 : 0x00: 0x950 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
374 /*0x00 : 0x00: 0x951 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
375 /*0x00 : 0x00: 0x952 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
376 /*0x00 : 0x00: 0x953 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
377 /*0x00 : 0x00: 0x954 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
378 /*0x00 : 0x00: 0x955 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
379 /*0x00 : 0x00: 0x956 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
380 /*0x00 : 0x00: 0x957 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ MLM_MASK
+ ZERO
,
381 /*0x00 : 0x00: 0x958 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
382 /*0x00 : 0x00: 0x959 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
383 /*0x00 : 0x00: 0x95a */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
384 /*0x00 : 0x00: 0x95b */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
385 /*0x00 : 0x00: 0x95c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
386 /*0x00 : 0x00: 0x95d */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
387 /*0x00 : 0x00: 0x95e */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
388 /*0xce : 0x98: 0x95f */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
389 /*0x00 : 0x00: 0x960 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
390 /*0x00 : 0x00: 0x961 */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
391 /*0x00 : 0x00: 0x962 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
392 /*0x00 : 0x00: 0x963 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
393 /*0xea : 0xf8: 0x964 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
394 /*0xeaea : 0x00: 0x965*/ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
395 /*0xf1 : 0xff: 0x966 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
396 /*0xf2 : 0xff: 0x967 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
397 /*0xf3 : 0xff: 0x968 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
398 /*0xf4 : 0xff: 0x969 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
399 /*0xf5 : 0xff: 0x96a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
400 /*0xf6 : 0xff: 0x96b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
401 /*0xf7 : 0xff: 0x96c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
402 /*0xf8 : 0xff: 0x96d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
403 /*0xf9 : 0xff: 0x96e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
404 /*0xfa : 0xff: 0x96f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
405 /*0x00 : 0x80: 0x970 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
407 * The length of the array is 128 to provide values for 0x900..0x97f.
408 * The last 15 entries for 0x971..0x97f of the validity table are all zero
409 * because no Indic script uses such Unicode code points.
411 /*0x00 : 0x00: 0x9yz */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
414 static const uint16_t fromUnicodeTable
[128]={
495 0xA1E9 ,/* 0x0950 */ /* OM Symbol */
544 static const uint16_t toUnicodeTable
[256]={
803 static const uint16_t vowelSignESpecialCases
[][2]={
804 { 2 /*length of array*/ , 0 },
808 static const uint16_t nuktaSpecialCases
[][2]={
809 { 16 /*length of array*/ , 0 },
/*
 * Writes targetByteUnit to target as one, two or three bytes, most
 * significant byte first, recording an offset for each byte written. Bytes
 * that do not fit before targetLimit are appended to the converter's
 * charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 * The macro evaluates its arguments more than once and advances target and
 * offsets in place.
 */
828 #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
829 int32_t offset = (int32_t)(source - args->source-1); \
830 /* write targetByteUnit to target */ \
831 if(target < targetLimit){ \
832 if(targetByteUnit <= 0xFF){ \
833 *(target)++ = (uint8_t)(targetByteUnit); \
835 *(offsets++) = offset; \
838 if (targetByteUnit > 0xFFFF) { \
839 *(target)++ = (uint8_t)(targetByteUnit>>16); \
842 *(offsets++) = offset; \
845 if (!(target < targetLimit)) { \
846 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
847 (uint8_t)(targetByteUnit >> 8); \
848 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
849 (uint8_t)targetByteUnit; \
850 *err = U_BUFFER_OVERFLOW_ERROR; \
852 *(target)++ = (uint8_t)(targetByteUnit>>8); \
854 *(offsets++) = offset; \
856 if(target < targetLimit){ \
857 *(target)++ = (uint8_t) targetByteUnit; \
859 *(offsets++) = offset ; \
862 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\
863 (uint8_t) (targetByteUnit); \
864 *err = U_BUFFER_OVERFLOW_ERROR; \
869 if (targetByteUnit & 0xFF0000) { \
870 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
871 (uint8_t) (targetByteUnit >>16); \
873 if(targetByteUnit & 0xFF00){ \
874 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
875 (uint8_t) (targetByteUnit >>8); \
877 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
878 (uint8_t) (targetByteUnit); \
879 *err = U_BUFFER_OVERFLOW_ERROR; \
890 static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
891 UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
892 const UChar
*source
= args
->source
;
893 const UChar
*sourceLimit
= args
->sourceLimit
;
894 unsigned char *target
= (unsigned char *) args
->target
;
895 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
896 int32_t* offsets
= args
->offsets
;
897 uint32_t targetByteUnit
= 0x0000;
898 UChar32 sourceChar
= 0x0000;
899 UChar32 tempContextFromUnicode
= 0x0000; /* For special handling of the Gurmukhi script. */
900 UConverterDataISCII
*converterData
;
903 UBool deltaChanged
= FALSE
;
905 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)) {
906 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
909 /* initialize data */
910 converterData
=(UConverterDataISCII
*)args
->converter
->extraInfo
;
911 newDelta
=converterData
->currentDeltaFromUnicode
;
912 range
= (uint16_t)(newDelta
/DELTA
);
914 if ((sourceChar
= args
->converter
->fromUChar32
)!=0) {
918 /*writing the char to the output stream */
919 while (source
< sourceLimit
) {
920 /* Write the language code following LF only if LF is not the last character. */
921 if (args
->converter
->fromUnicodeStatus
== LF
) {
922 targetByteUnit
= ATR
<<8;
923 targetByteUnit
+= (uint8_t) lookupInitialData
[range
].isciiLang
;
924 args
->converter
->fromUnicodeStatus
= 0x0000;
925 /* now append ATR and language code */
926 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
927 if (U_FAILURE(*err
)) {
932 sourceChar
= *source
++;
933 tempContextFromUnicode
= converterData
->contextCharFromUnicode
;
935 targetByteUnit
= missingCharMarker
;
937 /*check if input is in ASCII and C0 control codes range*/
938 if (sourceChar
<= ASCII_END
) {
939 args
->converter
->fromUnicodeStatus
= sourceChar
;
940 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,sourceChar
,err
);
941 if (U_FAILURE(*err
)) {
946 switch (sourceChar
) {
948 /* contextChar has HALANT */
949 if (converterData
->contextCharFromUnicode
) {
950 converterData
->contextCharFromUnicode
= 0x00;
951 targetByteUnit
= ISCII_HALANT
;
953 /* consume ZWNJ and continue */
954 converterData
->contextCharFromUnicode
= 0x00;
959 /* contextChar has HALANT */
960 if (converterData
->contextCharFromUnicode
) {
961 targetByteUnit
= ISCII_NUKTA
;
963 targetByteUnit
=ISCII_INV
;
965 converterData
->contextCharFromUnicode
= 0x00;
968 /* is the sourceChar in the INDIC_RANGE? */
969 if ((uint16_t)(INDIC_BLOCK_END
-sourceChar
) <= INDIC_RANGE
) {
970 /* Danda and Double Danda are valid in Northern scripts. Since Unicode
971 * does not include these codepoints in all Northern scripts we need to
974 if (sourceChar
!= DANDA
&& sourceChar
!= DOUBLE_DANDA
) {
975 /* find out to which block the sourceChar belongs */
976 range
=(uint16_t)((sourceChar
-INDIC_BLOCK_BEGIN
)/DELTA
);
977 newDelta
=(uint16_t)(range
*DELTA
);
979 /* Now are we in the same block as the previous? */
980 if (newDelta
!= converterData
->currentDeltaFromUnicode
|| converterData
->isFirstBuffer
) {
981 converterData
->currentDeltaFromUnicode
= newDelta
;
982 converterData
->currentMaskFromUnicode
= lookupInitialData
[range
].maskEnum
;
984 converterData
->isFirstBuffer
=FALSE
;
987 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
) {
988 if (sourceChar
== PNJ_TIPPI
) {
989 /* Make sure Tippi is converted to Bindi. */
990 sourceChar
= PNJ_BINDI
;
991 } else if (sourceChar
== PNJ_ADHAK
) {
992 /* This is for consonant cluster handling. */
993 converterData
->contextCharFromUnicode
= PNJ_ADHAK
;
997 /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
998 /* now subtract the new delta from sourceChar*/
999 sourceChar
-= converterData
->currentDeltaFromUnicode
;
1002 /* get the target byte unit */
1003 targetByteUnit
=fromUnicodeTable
[(uint8_t)sourceChar
];
1005 /* is the code point valid in current script? */
1006 if ((validityTable
[(uint8_t)sourceChar
] & converterData
->currentMaskFromUnicode
)==0) {
1007 /* Vocallic RR is assigned in ISCII Telugu and Unicode */
1008 if (converterData
->currentDeltaFromUnicode
!=(TELUGU_DELTA
) || sourceChar
!=VOCALLIC_RR
) {
1009 targetByteUnit
=missingCharMarker
;
1014 /* we are in a script block which is different from the
1015 * previous sourceChar's script block; write ATR and language codes
1018 temp
=(uint16_t)(ATR
<<8);
1019 temp
+= (uint16_t)((uint8_t) lookupInitialData
[range
].isciiLang
);
1022 /* now append ATR and language code */
1023 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,temp
,err
);
1024 if (U_FAILURE(*err
)) {
1029 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& (sourceChar
+ PNJ_DELTA
) == PNJ_ADHAK
) {
1033 /* reset context char */
1034 converterData
->contextCharFromUnicode
= 0x00;
1037 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& tempContextFromUnicode
== PNJ_ADHAK
&& isPNJConsonant((sourceChar
+ PNJ_DELTA
))) {
1038 /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
1039 /* reset context char */
1040 converterData
->contextCharFromUnicode
= 0x0000;
1041 targetByteUnit
= targetByteUnit
<< 16 | ISCII_HALANT
<< 8 | targetByteUnit
;
1042 /* write targetByteUnit to target */
1043 WRITE_TO_TARGET_FROM_U(args
, offsets
, source
, target
, targetLimit
, targetByteUnit
,err
);
1044 if (U_FAILURE(*err
)) {
1047 } else if (targetByteUnit
!= missingCharMarker
) {
1048 if (targetByteUnit
==ISCII_HALANT
) {
1049 converterData
->contextCharFromUnicode
= (UChar
)targetByteUnit
;
1051 /* write targetByteUnit to target*/
1052 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
1053 if (U_FAILURE(*err
)) {
1057 /* oops.. the code point is unassigned */
1058 /*check if the char is a First surrogate*/
1059 if (U16_IS_SURROGATE(sourceChar
)) {
1060 if (U16_IS_SURROGATE_LEAD(sourceChar
)) {
1062 /*look ahead to find the trail surrogate*/
1063 if (source
< sourceLimit
) {
1064 /* test the following code unit */
1065 UChar trail
= (*source
);
1066 if (U16_IS_TRAIL(trail
)) {
1068 sourceChar
=U16_GET_SUPPLEMENTARY(sourceChar
, trail
);
1069 *err
=U_INVALID_CHAR_FOUND
;
1070 /* convert this surrogate code point */
1071 /* exit this condition tree */
1073 /* this is an unmatched lead code unit (1st surrogate) */
1074 /* callback(illegal) */
1075 *err
=U_ILLEGAL_CHAR_FOUND
;
1079 *err
= U_ZERO_ERROR
;
1082 /* this is an unmatched trail code unit (2nd surrogate) */
1083 /* callback(illegal) */
1084 *err
=U_ILLEGAL_CHAR_FOUND
;
1087 /* callback(unassigned) for a BMP code point */
1088 *err
= U_INVALID_CHAR_FOUND
;
1091 args
->converter
->fromUChar32
=sourceChar
;
1094 }/* end while(mySourceIndex<mySourceLength) */
1096 /*save the state and return */
1097 args
->source
= source
;
1098 args
->target
= (char*)target
;
/*
 * Script switch table for toUnicode, indexed by the low nibble of the byte
 * that follows ATR (sourceChar & 0x0F). Column 0 is multiplied by DELTA to
 * give the new Unicode block delta; column 1 is the validityTable mask.
 * BENGALI appears twice; the validity table comment notes that Unicode does
 * not distinguish Bengali and Assamese (presumably the second row is the
 * Assamese script code -- confirm).
 */
1101 static const uint16_t lookupTable
[][2]={
1102 { ZERO
, ZERO
}, /*DEFAULT*/
1103 { ZERO
, ZERO
}, /*ROMAN*/
1104 { DEVANAGARI
, DEV_MASK
},
1105 { BENGALI
, BNG_MASK
},
1106 { TAMIL
, TML_MASK
},
1107 { TELUGU
, KND_MASK
},
1108 { BENGALI
, BNG_MASK
},
1109 { ORIYA
, ORI_MASK
},
1110 { KANNADA
, KND_MASK
},
1111 { MALAYALAM
, MLM_MASK
},
1112 { GUJARATI
, GJR_MASK
},
1113 { GURMUKHI
, PNJ_MASK
}
/*
 * Emits one UChar. Values above ASCII_END other than ZWJ, ZWNJ, DANDA and
 * DOUBLE_DANDA are first shifted by delta (targetUniChar is modified in
 * place). If target has reached args->targetLimit the unit is stored in the
 * converter's UCharErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 */
1116 #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
1117 /* shift into the current Indic block by adding delta */ \
1118 if(targetUniChar>ASCII_END && \
1119 targetUniChar != ZWJ && \
1120 targetUniChar != ZWNJ && \
1121 targetUniChar != DANDA && \
1122 targetUniChar != DOUBLE_DANDA){ \
1124 targetUniChar+=(uint16_t)(delta); \
1126 /* now write the targetUniChar */ \
1127 if(target<args->targetLimit){ \
1128 *(target)++ = (UChar)targetUniChar; \
1130 *(offsets)++ = (int32_t)(offset); \
1133 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \
1134 (UChar)targetUniChar; \
1135 *err = U_BUFFER_OVERFLOW_ERROR; \
/*
 * Looks up sourceChar in toUnicodeTable and stores the result in
 * targetUniChar. For bytes above ASCII_END whose mapping is not flagged in
 * validityTable for data->currentMaskToUnicode, the result is replaced by
 * missingCharMarker, except for VOCALLIC_RR while the Telugu delta is active.
 */
1139 #define GET_MAPPING(sourceChar,targetUniChar,data){ \
1140 targetUniChar = toUnicodeTable[(sourceChar)] ; \
1141 /* is the code point valid in current script? */ \
1142 if(sourceChar> ASCII_END && \
1143 (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \
1144 /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
1145 if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
1146 targetUniChar!=VOCALLIC_RR){ \
1147 targetUniChar=missingCharMarker; \
1153 * Rules for ISCII to Unicode converter
1154 * ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
1155 * which has both precomposed and decomposed forms of characters,
1156 * pre-context and post-context need to be considered.
1159 * i) ATR : Attribute code is used to declare the font and script switching.
1160 * Currently we only switch scripts; font codes are consumed without generating an error
1161 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
1162 * obsolete characters
1164 * i) Halant: if preceded by a halant then it is an explicit halant
1166 * a) if preceded by a halant then it is a soft halant
1167 * b) if preceded by specific consonants and the ligatures have pre-composed
1168 * characters in Unicode then convert to pre-composed characters
1169 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
/*
 * ISCII -> Unicode conversion routine.
 *
 * Consumes bytes from args->source up to args->sourceLimit and writes UChars
 * through WRITE_TO_TARGET_TO_U into args->target (bounded by args->targetLimit).
 *
 * State kept between bytes (and between calls):
 *   - *contextCharToUnicode (in the converter's UConverterDataISCII): the
 *     previous ISCII byte, or NO_CHAR_MARKER when there is no usable context.
 *   - *toUnicodeStatus (args->converter->toUnicodeStatus): the mapping of the
 *     previous byte whose write is delayed by one step so that the following
 *     byte can still change it (double danda, halant, nukta, vowel sign E).
 *   - data->prevToUnicodeStatus: a further delayed code point used for
 *     Gurmukhi consonant clustering.
 *
 * Error codes stored in *err by the visible code: U_ILLEGAL_ARGUMENT_ERROR,
 * U_ILLEGAL_CHAR_FOUND, U_INVALID_CHAR_FOUND and U_BUFFER_OVERFLOW_ERROR.
 * Before returning, args->target and args->source are set to the advanced
 * local pointers.
 */
1173 static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
, UErrorCode
* err
) {
1174 const char *source
= ( char *) args
->source
;
1175 UChar
*target
= args
->target
;
1176 const char *sourceLimit
= args
->sourceLimit
;
1177 const UChar
* targetLimit
= args
->targetLimit
;
1178 uint32_t targetUniChar
= 0x0000;
1179 uint8_t sourceChar
= 0x0000;
1180 UConverterDataISCII
* data
;
1181 UChar32
* toUnicodeStatus
=NULL
;
1182 UChar32 tempTargetUniChar
= 0x0000;
1183 UChar
* contextCharToUnicode
= NULL
;
/* Argument sanity check; a failure is reported as U_ILLEGAL_ARGUMENT_ERROR. */
1188 if ((args
->converter
== NULL
) || (target
< args
->target
) || (source
< args
->source
)) {
1189 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1193 data
= (UConverterDataISCII
*)(args
->converter
->extraInfo
);
1194 contextCharToUnicode
= &data
->contextCharToUnicode
; /* contains previous ISCII codepoint visited */
1195 toUnicodeStatus
= (UChar32
*)&args
->converter
->toUnicodeStatus
;/* contains the mapping to Unicode of the above codepoint*/
/* Main loop: runs while no error is pending and source bytes remain. */
1197 while (U_SUCCESS(*err
) && source
<sourceLimit
) {
1199 targetUniChar
= missingCharMarker
;
1201 if (target
< targetLimit
) {
1202 sourceChar
= (unsigned char)*(source
)++;
1204 /* look at the post-context and perform special processing */
1205 if (*contextCharToUnicode
==ATR
) {
1207 /* If we have ATR in *contextCharToUnicode then we need to change our
1208 * state to the Indic Script specified by sourceChar
1211 /* check if the sourceChar is supported script range*/
/* The uint8_t cast turns the two-sided range test into a single unsigned comparison. */
1212 if ((uint8_t)(PNJ
-sourceChar
)<=PNJ
-DEV
) {
1213 data
->currentDeltaToUnicode
= (uint16_t)(lookupTable
[sourceChar
& 0x0F][0] * DELTA
);
1214 data
->currentMaskToUnicode
= (MaskEnum
)lookupTable
[sourceChar
& 0x0F][1];
1215 } else if (sourceChar
==DEF
) {
1216 /* switch back to default */
1217 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1218 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1220 if ((sourceChar
>= 0x21 && sourceChar
<= 0x3F)) {
1221 /* these are display codes consume and continue */
1223 *err
=U_ILLEGAL_CHAR_FOUND
;
1225 *contextCharToUnicode
=NO_CHAR_MARKER
;
1231 *contextCharToUnicode
=NO_CHAR_MARKER
;
1235 } else if (*contextCharToUnicode
==EXT
) {
1236 /* check if sourceChar is in 0xA1-0xEE range */
1237 if ((uint8_t) (EXT_RANGE_END
- sourceChar
) <= (EXT_RANGE_END
- EXT_RANGE_BEGIN
)) {
1238 /* We currently support only Anudatta and Devanagari abbreviation sign */
1239 if (sourceChar
==0xBF || sourceChar
== 0xB8) {
1240 targetUniChar
= (sourceChar
==0xBF) ? DEV_ABBR_SIGN
: DEV_ANUDATTA
;
1242 /* find out if the mapping is valid in this state */
1243 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1244 *contextCharToUnicode
= NO_CHAR_MARKER
;
1246 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1247 if (data
->prevToUnicodeStatus
) {
1248 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1249 data
->prevToUnicodeStatus
= 0x0000;
1251 /* write to target */
1252 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1257 /* byte unit is unassigned */
1258 targetUniChar
= missingCharMarker
;
1259 *err
= U_INVALID_CHAR_FOUND
;
1261 /* only 0xA1 - 0xEE are legal after EXT char */
1262 *contextCharToUnicode
= NO_CHAR_MARKER
;
1263 *err
= U_ILLEGAL_CHAR_FOUND
;
1266 } else if (*contextCharToUnicode
==ISCII_INV
) {
1267 if (sourceChar
==ISCII_HALANT
) {
1268 targetUniChar
= 0x0020; /* replace with space according to Indic FAQ */
1270 targetUniChar
= ZWJ
;
1273 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1274 if (data
->prevToUnicodeStatus
) {
1275 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1276 data
->prevToUnicodeStatus
= 0x0000;
1278 /* write to target */
1279 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1281 *contextCharToUnicode
=NO_CHAR_MARKER
;
1284 /* look at the pre-context and perform special processing */
1285 switch (sourceChar
) {
1287 case EXT
: /*falls through*/
1289 *contextCharToUnicode
= (UChar
)sourceChar
;
1291 if (*toUnicodeStatus
!= missingCharMarker
) {
1292 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1293 if (data
->prevToUnicodeStatus
) {
1294 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1295 data
->prevToUnicodeStatus
= 0x0000;
1297 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1298 *toUnicodeStatus
= missingCharMarker
;
1302 /* handle double danda*/
1303 if (*contextCharToUnicode
== ISCII_DANDA
) {
1304 targetUniChar
= DOUBLE_DANDA
;
1305 /* clear the context */
1306 *contextCharToUnicode
= NO_CHAR_MARKER
;
1307 *toUnicodeStatus
= missingCharMarker
;
1309 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1310 *contextCharToUnicode
= sourceChar
;
1314 /* handle explicit halant */
1315 if (*contextCharToUnicode
== ISCII_HALANT
) {
1316 targetUniChar
= ZWNJ
;
1317 /* clear the context */
1318 *contextCharToUnicode
= NO_CHAR_MARKER
;
1320 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1321 *contextCharToUnicode
= sourceChar
;
1327 data
->resetToDefaultToUnicode
= TRUE
;
1328 GET_MAPPING(sourceChar
,targetUniChar
,data
)
1330 *contextCharToUnicode
= sourceChar
;
/* Vowel sign E: search vowelSignESpecialCases for an entry keyed on the previous ISCII byte. */
1333 case ISCII_VOWEL_SIGN_E
:
1336 for (; i
<vowelSignESpecialCases
[0][0]; i
++) {
1337 U_ASSERT(i
<sizeof(vowelSignESpecialCases
)/sizeof(vowelSignESpecialCases
[0]));
1338 if (vowelSignESpecialCases
[i
][0]==(uint8_t)*contextCharToUnicode
) {
1339 targetUniChar
=vowelSignESpecialCases
[i
][1];
1345 /* find out if the mapping is valid in this state */
1346 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1347 /*targetUniChar += data->currentDeltaToUnicode ;*/
1348 *contextCharToUnicode
= NO_CHAR_MARKER
;
1349 *toUnicodeStatus
= missingCharMarker
;
1353 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1354 *contextCharToUnicode
= sourceChar
;
1358 /* handle soft halant */
1359 if (*contextCharToUnicode
== ISCII_HALANT
) {
1360 targetUniChar
= ZWJ
;
1361 /* clear the context */
1362 *contextCharToUnicode
= NO_CHAR_MARKER
;
1364 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->contextCharToUnicode
== 0xc0) {
1365 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1366 if (data
->prevToUnicodeStatus
) {
1367 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1368 data
->prevToUnicodeStatus
= 0x0000;
1370 /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
1371 * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
1373 targetUniChar
= PNJ_RRA
;
1374 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1375 if (U_SUCCESS(*err
)) {
1376 targetUniChar
= PNJ_SIGN_VIRAMA
;
1377 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1378 if (U_SUCCESS(*err
)) {
1379 targetUniChar
= PNJ_HA
;
1380 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1382 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1385 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_SIGN_VIRAMA
;
1386 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1388 *toUnicodeStatus
= missingCharMarker
;
1389 data
->contextCharToUnicode
= NO_CHAR_MARKER
;
1392 /* try to handle <CHAR> + ISCII_NUKTA special mappings */
1395 for (; i
<nuktaSpecialCases
[0][0]; i
++) {
1396 if (nuktaSpecialCases
[i
][0]==(uint8_t)
1397 *contextCharToUnicode
) {
1398 targetUniChar
=nuktaSpecialCases
[i
][1];
1404 /* find out if the mapping is valid in this state */
1405 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1406 /*targetUniChar += data->currentDeltaToUnicode ;*/
1407 *contextCharToUnicode
= NO_CHAR_MARKER
;
1408 *toUnicodeStatus
= missingCharMarker
;
1409 if (data
->currentDeltaToUnicode
== PNJ_DELTA
) {
1410 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1411 if (data
->prevToUnicodeStatus
) {
1412 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1413 data
->prevToUnicodeStatus
= 0x0000;
1415 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1420 /* else fall through to default */
1422 /* else fall through to default */
1424 default:GET_MAPPING(sourceChar
,targetUniChar
,data
)
1426 *contextCharToUnicode
= sourceChar
;
/* A code point saved by an earlier byte is still pending in *toUnicodeStatus; it is written out here, with Gurmukhi-specific adjustments. */
1430 if (*toUnicodeStatus
!= missingCharMarker
) {
1431 /* Check to make sure that consonant clusters are handled correct for Gurmukhi script. */
1432 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->prevToUnicodeStatus
!= 0 && isPNJConsonant(data
->prevToUnicodeStatus
) &&
1433 (*toUnicodeStatus
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& (targetUniChar
+ PNJ_DELTA
) == data
->prevToUnicodeStatus
) {
1434 /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
1435 offset
= (int)(source
-args
->source
- 3);
1436 tempTargetUniChar
= PNJ_ADHAK
; /* This is necessary to avoid some compiler warnings. */
1437 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,tempTargetUniChar
,0,err
);
1438 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,data
->prevToUnicodeStatus
,0,err
);
1439 data
->prevToUnicodeStatus
= 0x0000; /* reset the previous unicode code point */
1440 *toUnicodeStatus
= missingCharMarker
;
1443 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1444 if (data
->prevToUnicodeStatus
) {
1445 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1446 data
->prevToUnicodeStatus
= 0x0000;
1448 /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
1449 * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
1451 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_BINDI
&& isPNJBindiTippi((*toUnicodeStatus
+ PNJ_DELTA
))) {
1452 targetUniChar
= PNJ_TIPPI
- PNJ_DELTA
;
1453 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,PNJ_DELTA
,err
);
1454 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& isPNJConsonant((*toUnicodeStatus
+ PNJ_DELTA
))) {
1455 /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
1456 data
->prevToUnicodeStatus
= *toUnicodeStatus
+ PNJ_DELTA
;
1458 /* write the previously mapped codepoint */
1459 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1462 *toUnicodeStatus
= missingCharMarker
;
1465 if (targetUniChar
!= missingCharMarker
) {
1466 /* now save the targetUniChar for delayed write */
1467 *toUnicodeStatus
= (UChar
) targetUniChar
;
1468 if (data
->resetToDefaultToUnicode
==TRUE
) {
1469 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1470 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1471 data
->resetToDefaultToUnicode
=FALSE
;
1475 /* we reach here only if targetUniChar == missingCharMarker
1476 * so assign codes to reason and err
1478 *err
= U_INVALID_CHAR_FOUND
;
1480 args
->converter
->toUBytes
[0] = (uint8_t) sourceChar
;
1481 args
->converter
->toULength
= 1;
1486 *err
=U_BUFFER_OVERFLOW_ERROR
;
1491 if (U_SUCCESS(*err
) && args
->flush
&& source
== sourceLimit
) {
1492 /* end of the input stream */
1493 UConverter
*cnv
= args
->converter
;
/* A trailing ATR, EXT or ISCII_INV byte is still waiting for its follow-up byte: record it in toUBytes[0]. */
1495 if (*contextCharToUnicode
==ATR
|| *contextCharToUnicode
==EXT
|| *contextCharToUnicode
==ISCII_INV
) {
1496 /* set toUBytes[] */
1497 cnv
->toUBytes
[0] = (uint8_t)*contextCharToUnicode
;
1500 /* avoid looping on truncated sequences */
1501 *contextCharToUnicode
= NO_CHAR_MARKER
;
1506 if (*toUnicodeStatus
!= missingCharMarker
) {
1507 /* output a remaining target character */
1508 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
- args
->source
-1),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1509 *toUnicodeStatus
= missingCharMarker
;
/* Hand the advanced target and source positions back to the caller. */
1513 args
->target
= target
;
1514 args
->source
= source
;
1517 /* structure for SafeClone calculations */
/* Its size is what _ISCII_SafeClone reports as the needed clone buffer size. */
1518 struct cloneISCIIStruct
{
/* Per-clone copy of the ISCII state; _ISCII_SafeClone fills it from cnv->extraInfo. */
1520 UConverterDataISCII mydata
;
/*
 * Safe-clone hook for the ISCII converter.
 *
 * When *pBufferSize is 0 the call is a preflight: the required size
 * (sizeof(struct cloneISCIIStruct)) is stored into *pBufferSize.
 * Otherwise stackBuffer is treated as a struct cloneISCIIStruct, the ISCII
 * state behind cnv->extraInfo is copied into its mydata member, the clone's
 * extraInfo is pointed at that copy and isExtraLocal is set to TRUE.
 * Returns the address of the cnv member of the clone struct.
 * NOTE(review): the return type, the remaining parameters and the early-exit
 * statements are not visible in this listing; confirm against the full file.
 */
1524 _ISCII_SafeClone(const UConverter
*cnv
,
1526 int32_t *pBufferSize
,
1529 struct cloneISCIIStruct
* localClone
;
1530 int32_t bufferSizeNeeded
= sizeof(struct cloneISCIIStruct
);
1532 if (U_FAILURE(*status
)) {
1536 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
1537 *pBufferSize
= bufferSizeNeeded
;
1541 localClone
= (struct cloneISCIIStruct
*)stackBuffer
;
1542 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1544 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataISCII
));
/* Point the clone at its own copy of the state rather than the original's. */
1545 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
1546 localClone
->cnv
.isExtraLocal
= TRUE
;
1548 return &localClone
->cnv
;
/*
 * Adds the Unicode code points handled by the ISCII converter to the set
 * behind sa, using sa->add / sa->addRange:
 *   - the range 0..ASCII_END,
 *   - for every script from DEVANAGARI to MALAYALAM, each code point
 *     idx + script * DELTA + INDIC_BLOCK_BEGIN (idx in 0..DELTA-1) whose
 *     validityTable entry matches the script's mask, plus idx 0x31 for TELUGU,
 *   - DANDA, DOUBLE_DANDA, ZWNJ and ZWJ.
 * NOTE(review): cnv, which and pErrorCode are not referenced in the lines
 * visible here; confirm against the full file.
 */
1552 _ISCIIGetUnicodeSet(const UConverter
*cnv
,
1553 const USetAdder
*sa
,
1554 UConverterUnicodeSet which
,
1555 UErrorCode
*pErrorCode
)
1557 int32_t idx
, script
;
1560 /* Since all ISCII versions allow switching to other ISCII
1561 scripts, we add all roundtrippable characters to this set. */
1562 sa
->addRange(sa
->set
, 0, ASCII_END
);
1563 for (script
= DEVANAGARI
; script
<= MALAYALAM
; script
++) {
/* Per-script validity mask taken from lookupInitialData. */
1564 mask
= (uint8_t)(lookupInitialData
[script
].maskEnum
);
1565 for (idx
= 0; idx
< DELTA
; idx
++) {
1566 /* added check for TELUGU character */
1567 if ((validityTable
[idx
] & mask
) || (script
==TELUGU
&& idx
==0x31)) {
1568 sa
->add(sa
->set
, idx
+ (script
* DELTA
) + INDIC_BLOCK_BEGIN
);
/* Characters added explicitly, outside the per-script loop. */
1572 sa
->add(sa
->set
, DANDA
);
1573 sa
->add(sa
->set
, DOUBLE_DANDA
);
1574 sa
->add(sa
->set
, ZWNJ
);
1575 sa
->add(sa
->set
, ZWJ
);
/*
 * Function table for the ISCII converter. In the entries visible here, the
 * same to-Unicode routine fills two consecutive slots and the same
 * from-Unicode routine fills the next two.
 * NOTE(review): presumably these are the plain and with-offsets variants;
 * the other entries are not visible in this listing.
 */
1578 static const UConverterImpl _ISCIIImpl
={
1589 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1590 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1591 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
1592 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
/*
 * Static descriptor for the ISCII converter. It starts with the structure's
 * own size and includes a zero-filled reserved array; the other fields are
 * not visible in this listing.
 */
1602 static const UConverterStaticData _ISCIIStaticData
={
1603 sizeof(UConverterStaticData
),
1616 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
/*
 * Shared-data object for the ISCII converter. It has external linkage
 * (no static) and starts with the structure's own size; the remaining
 * initializers are not visible in this listing.
 */
1620 const UConverterSharedData _ISCIIData
={
1621 sizeof(UConverterSharedData
),
1631 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */