1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 **********************************************************************
5 * Copyright (C) 2000-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: ucnvisci.c
10 * tab size: 8 (not used)
13 * created on: 2001JUN26
14 * created by: Ram Viswanadha
16 * Date Name Description
17 * 24/7/2001 Ram Added support for EXT character handling
20 #include "unicode/utypes.h"
22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
24 #include "unicode/ucnv.h"
25 #include "unicode/ucnv_cb.h"
26 #include "unicode/utf16.h"
/*
 * Constants used by the ISCII converter.
 *
 * The low four bits of the converter options select the ISCII version, which
 * _ISCIIOpen uses as an index into lookupInitialData.
 */
#define UCNV_OPTIONS_VERSION_MASK 0xf

#define ZWNJ 0x200c /* Zero Width Non Joiner */
#define ZWJ 0x200d /* Zero width Joiner */
#define INVALID_CHAR 0xffff
#define ATR 0xEF /* Attribute code */
#define EXT 0xF0 /* Extension code */

#define DOUBLE_DANDA 0x0965
#define ISCII_NUKTA 0xE9
#define ISCII_HALANT 0xE8
#define ISCII_DANDA 0xEA
#define ISCII_INV 0xD9
#define ISCII_VOWEL_SIGN_E 0xE0
#define INDIC_BLOCK_BEGIN 0x0900
#define INDIC_BLOCK_END 0x0D7F
#define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
/* Special-cased for Telugu in the validity checks of both conversion directions. */
#define VOCALLIC_RR 0x0931

/* fromUnicode writes code units <= ASCII_END straight to the output. */
#define ASCII_END 0xA0
/* Initial/reset value of contextCharToUnicode. */
#define NO_CHAR_MARKER 0xFFFE
/*
 * Parenthesized so the product stays one operand wherever the macro is
 * expanded (for example next to '/', '%' or a unary operator).
 */
#define TELUGU_DELTA (DELTA * TELUGU)
#define DEV_ABBR_SIGN 0x0970
#define DEV_ANUDATTA 0x0952
/* Byte range accepted after an EXT code (0xA1-0xEE). */
#define EXT_RANGE_BEGIN 0xA1
#define EXT_RANGE_END 0xEE

/* Gurmukhi (PNJ) specific values. */
#define PNJ_DELTA 0x0100
#define PNJ_BINDI 0x0A02
#define PNJ_TIPPI 0x0A70
#define PNJ_SIGN_VIRAMA 0x0A4D
#define PNJ_ADHAK 0x0A71
#define PNJ_RRA 0x0A5C
83 * Enumeration for switching code pages if <ATR>+<one of below values>
119 #define ISCII_CNV_PREFIX "ISCII,version="
/*
 * Per-converter ISCII state. _ISCIIOpen allocates one instance of this
 * structure and stores it in cnv->extraInfo.
 */
122 UChar contextCharToUnicode
; /* previous ISCII code point visited by toUnicode (e.g. ATR or EXT), for contextual analysis */
123 UChar contextCharFromUnicode
; /* context left by the previous character in fromUnicode (ISCII_HALANT or PNJ_ADHAK), 0 if none */
124 uint16_t defDeltaToUnicode
; /* delta for switching to default state when DEF is encountered */
125 uint16_t currentDeltaFromUnicode
; /* current delta in Indic block */
126 uint16_t currentDeltaToUnicode
; /* current delta in Indic block */
127 MaskEnum currentMaskFromUnicode
; /* mask for current state in fromUnicode */
128 MaskEnum currentMaskToUnicode
; /* mask for current state in toUnicode */
129 MaskEnum defMaskToUnicode
; /* mask for default state in toUnicode */
130 UBool isFirstBuffer
; /* boolean for fromUnicode to see if we need to announce the first script */
131 UBool resetToDefaultToUnicode
; /* boolean for resetting to default delta and mask when a newline is encountered */
/* Holds ISCII_CNV_PREFIX followed by one version digit and the terminating NUL (filled in by _ISCIIOpen). */
132 char name
[sizeof(ISCII_CNV_PREFIX
) + 1];
133 UChar32 prevToUnicodeStatus
; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
134 } UConverterDataISCII
;
136 typedef struct LookupDataStruct
{
/*
 * One row per supported script. _ISCIIOpen indexes this table with
 * (options & UCNV_OPTIONS_VERSION_MASK) and reads .uniLang and .maskEnum;
 * fromUnicode indexes it with the script block number and reads .isciiLang
 * to build the ATR + language-code sequence.
 * Each row appears to be { uniLang, maskEnum, isciiLang }; the struct
 * definition is not visible in this listing, so confirm the field order there.
 * The TELUGU row uses KND_MASK because Telugu and Kannada share one validity bit.
 */
142 static const LookupDataStruct lookupInitialData
[]={
143 { DEVANAGARI
, DEV_MASK
, DEV
},
144 { BENGALI
, BNG_MASK
, BNG
},
145 { GURMUKHI
, PNJ_MASK
, PNJ
},
146 { GUJARATI
, GJR_MASK
, GJR
},
147 { ORIYA
, ORI_MASK
, ORI
},
148 { TAMIL
, TML_MASK
, TML
},
149 { TELUGU
, KND_MASK
, TLG
},
150 { KANNADA
, KND_MASK
, KND
},
151 { MALAYALAM
, MLM_MASK
, MLM
}
155 * For special handling of certain Gurmukhi characters.
156 * Bit 0 (value 1): PNJ consonant
157 * Bit 1 (value 2): PNJ Bindi Tippi
/*
 * Indexed by (c - 0x0A00) for c in 0x0A00..0x0A4F, 16 entries per row.
 * Entry values: 0 = neither bit, 2 = Bindi/Tippi bit only, 3 = both bits.
 */
159 static const uint8_t pnjMap
[80] = {
/* 0x0A00..0x0A0F */
161 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x0A10..0x0A1F */
163 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 0x0A20..0x0A2F */
165 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3,
/* 0x0A30..0x0A3F */
167 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2,
/* 0x0A40..0x0A4F */
169 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
/*
 * Tests the "PNJ consonant" bit (bit 0) of pnjMap for code point c.
 * Code points outside 0x0A00..0x0A4F are caught by the range guard before the
 * table is indexed; the statement executed for them is not visible in this
 * listing. For in-range code points the result is pnjMap[c - 0xa00] & 1.
 */
173 isPNJConsonant(UChar32 c
) {
174 if (c
< 0xa00 || 0xa50 <= c
) {
177 return (UBool
)(pnjMap
[c
- 0xa00] & 1);
/*
 * Tests the "PNJ Bindi Tippi" bit (bit 1) of pnjMap for code point c.
 * Code points outside 0x0A00..0x0A4F are caught by the range guard before the
 * table is indexed; the statement executed for them is not visible in this
 * listing. For in-range code points the result is pnjMap[c - 0xa00] >> 1,
 * which is 0 or 1 because the table only holds the values 0, 2 and 3.
 */
182 isPNJBindiTippi(UChar32 c
) {
183 if (c
< 0xa00 || 0xa50 <= c
) {
186 return (UBool
)(pnjMap
[c
- 0xa00] >> 1);
/*
 * Opens an ISCII converter: allocates a UConverterDataISCII, stores it in
 * cnv->extraInfo and initializes the state for both conversion directions.
 *
 * cnv        converter being opened; receives the allocated extraInfo.
 * pArgs      load arguments; the low four bits of pArgs->options
 *            (UCNV_OPTIONS_VERSION_MASK) select the ISCII version, which is
 *            used as the row index into lookupInitialData.
 * errorCode  set to U_ILLEGAL_ARGUMENT_ERROR (after freeing extraInfo) on the
 *            path that rejects the requested version, and to
 *            U_MEMORY_ALLOCATION_ERROR on the allocation-failure path.
 *
 * NOTE(review): several lines of this function (the onlyTestIsLoadable branch
 * body, the declaration of len, the else keywords and closing braces) are
 * missing from this listing.
 */
190 static void U_CALLCONV
191 _ISCIIOpen(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
) {
192 if(pArgs
->onlyTestIsLoadable
) {
196 cnv
->extraInfo
= uprv_malloc(sizeof(UConverterDataISCII
));
198 if (cnv
->extraInfo
!= NULL
) {
200 UConverterDataISCII
*converterData
=
201 (UConverterDataISCII
*) cnv
->extraInfo
;
/* start both directions with no pending context */
202 converterData
->contextCharToUnicode
=NO_CHAR_MARKER
;
203 cnv
->toUnicodeStatus
= missingCharMarker
;
204 converterData
->contextCharFromUnicode
=0x0000;
205 converterData
->resetToDefaultToUnicode
=FALSE
;
206 /* check if the version requested is supported */
207 if ((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) < 9) {
208 /* initialize state variables */
/* the current and default deltas all start at uniLang * DELTA for the selected row */
209 converterData
->currentDeltaFromUnicode
210 = converterData
->currentDeltaToUnicode
211 = converterData
->defDeltaToUnicode
= (uint16_t)(lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].uniLang
* DELTA
);
/* likewise the current and default masks all start at the row's maskEnum */
213 converterData
->currentMaskFromUnicode
214 = converterData
->currentMaskToUnicode
215 = converterData
->defMaskToUnicode
= lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].maskEnum
;
217 converterData
->isFirstBuffer
=TRUE
;
/* build the converter name: ISCII_CNV_PREFIX + version digit + NUL */
218 (void)uprv_strcpy(converterData
->name
, ISCII_CNV_PREFIX
);
219 len
= (int32_t)uprv_strlen(converterData
->name
);
220 converterData
->name
[len
]= (char)((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) + '0');
221 converterData
->name
[len
+1]=0;
223 converterData
->prevToUnicodeStatus
= 0x0000;
/* rejected version: release the state and report an illegal argument */
225 uprv_free(cnv
->extraInfo
);
226 cnv
->extraInfo
= NULL
;
227 *errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
/* allocation failure */
231 *errorCode
=U_MEMORY_ALLOCATION_ERROR
;
/*
 * Releases the ISCII state attached to cnv: when cnv->extraInfo is non-NULL
 * and cnv->isExtraLocal is false, the block is freed with uprv_free.
 * NOTE(review): the lines following the uprv_free call are missing from this
 * listing, so anything done after the free cannot be confirmed here.
 */
235 static void U_CALLCONV
236 _ISCIIClose(UConverter
*cnv
) {
237 if (cnv
->extraInfo
!=NULL
) {
238 if (!cnv
->isExtraLocal
) {
239 uprv_free(cnv
->extraInfo
);
/*
 * Name accessor for an ISCII converter. When cnv->extraInfo is set, it is
 * viewed as the converter's UConverterDataISCII (myData).
 * NOTE(review): the return statements are missing from this listing;
 * presumably myData->name (built in _ISCIIOpen) is returned -- confirm
 * against the full file.
 */
245 static const char* U_CALLCONV
246 _ISCIIgetName(const UConverter
* cnv
) {
247 if (cnv
->extraInfo
) {
248 UConverterDataISCII
* myData
= (UConverterDataISCII
*)cnv
->extraInfo
;
/*
 * Resets converter state according to choice.
 *
 * choice <= UCNV_RESET_TO_UNICODE: the toUnicode side is reset -- status back
 *   to missingCharMarker, current delta/mask back to their defaults, context
 *   back to NO_CHAR_MARKER, prevToUnicodeStatus cleared.
 * choice != UCNV_RESET_TO_UNICODE: the fromUnicode side is reset -- pending
 *   fromUChar32 and context cleared, current delta/mask restored, and
 *   isFirstBuffer set again.
 *
 * The fromUnicode side is restored from defMaskToUnicode/defDeltaToUnicode;
 * UConverterDataISCII has no separate fromUnicode defaults.
 * NOTE(review): closing braces are missing from this listing, so it cannot be
 * confirmed here which branch the final resetToDefaultToUnicode assignment
 * belongs to.
 */
254 static void U_CALLCONV
255 _ISCIIReset(UConverter
*cnv
, UConverterResetChoice choice
) {
256 UConverterDataISCII
* data
=(UConverterDataISCII
*) (cnv
->extraInfo
);
257 if (choice
<=UCNV_RESET_TO_UNICODE
) {
258 cnv
->toUnicodeStatus
= missingCharMarker
;
260 data
->currentDeltaToUnicode
=data
->defDeltaToUnicode
;
261 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
262 data
->contextCharToUnicode
=NO_CHAR_MARKER
;
263 data
->prevToUnicodeStatus
= 0x0000;
265 if (choice
!=UCNV_RESET_TO_UNICODE
) {
266 cnv
->fromUChar32
=0x0000;
267 data
->contextCharFromUnicode
=0x00;
268 data
->currentMaskFromUnicode
=data
->defMaskToUnicode
;
269 data
->currentDeltaFromUnicode
=data
->defDeltaToUnicode
;
270 data
->isFirstBuffer
=TRUE
;
271 data
->resetToDefaultToUnicode
=FALSE
;
276 * The values in the validity table are indexed by the low 7 bits of the
277 * Devanagari-normalized code point (0x0900 - 0x097f). The values have a structure like:
278 * ---------------------------------------------------------------
279 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
280 * | | | | | ASM | KND | | |
281 * ---------------------------------------------------------------
282 * If a code point is valid in a particular script
283 * then that bit is turned on
285 * Unicode does not distinguish between Bengali and Assamese so we use 1 bit
286 * to represent these languages
288 * Telugu and Kannada have the same codepoints except for Vocallic_RR, which we special-case,
289 * so we combine them and use 1 bit to represent these languages.
291 * TODO: It is probably easier to understand and maintain to change this
292 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
295 static const uint8_t validityTable
[128] = {
296 /* This state table is tool generated please do not edit unless you know exactly what you are doing */
297 /* Note: This table was edited to mirror the Windows XP implementation */
298 /*ISCII:Valid:Unicode */
299 /*0xa0 : 0x00: 0x900 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
300 /*0xa1 : 0xb8: 0x901 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
301 /*0xa2 : 0xfe: 0x902 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
302 /*0xa3 : 0xbf: 0x903 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
303 /*0x00 : 0x00: 0x904 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
304 /*0xa4 : 0xff: 0x905 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
305 /*0xa5 : 0xff: 0x906 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
306 /*0xa6 : 0xff: 0x907 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
307 /*0xa7 : 0xff: 0x908 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
308 /*0xa8 : 0xff: 0x909 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
309 /*0xa9 : 0xff: 0x90a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
310 /*0xaa : 0xfe: 0x90b */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
311 /*0x00 : 0x00: 0x90c */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
312 /*0xae : 0x80: 0x90d */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
313 /*0xab : 0x87: 0x90e */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
314 /*0xac : 0xff: 0x90f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
315 /*0xad : 0xff: 0x910 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
316 /*0xb2 : 0x80: 0x911 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
317 /*0xaf : 0x87: 0x912 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
318 /*0xb0 : 0xff: 0x913 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
319 /*0xb1 : 0xff: 0x914 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
320 /*0xb3 : 0xff: 0x915 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
321 /*0xb4 : 0xfe: 0x916 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
322 /*0xb5 : 0xfe: 0x917 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
323 /*0xb6 : 0xfe: 0x918 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
324 /*0xb7 : 0xff: 0x919 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
325 /*0xb8 : 0xff: 0x91a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
326 /*0xb9 : 0xfe: 0x91b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
327 /*0xba : 0xff: 0x91c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
328 /*0xbb : 0xfe: 0x91d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
329 /*0xbc : 0xff: 0x91e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
330 /*0xbd : 0xff: 0x91f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
331 /*0xbe : 0xfe: 0x920 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
332 /*0xbf : 0xfe: 0x921 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
333 /*0xc0 : 0xfe: 0x922 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
334 /*0xc1 : 0xff: 0x923 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
335 /*0xc2 : 0xff: 0x924 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
336 /*0xc3 : 0xfe: 0x925 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
337 /*0xc4 : 0xfe: 0x926 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
338 /*0xc5 : 0xfe: 0x927 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
339 /*0xc6 : 0xff: 0x928 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
340 /*0xc7 : 0x81: 0x929 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ TML_MASK
,
341 /*0xc8 : 0xff: 0x92a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
342 /*0xc9 : 0xfe: 0x92b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
343 /*0xca : 0xfe: 0x92c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
344 /*0xcb : 0xfe: 0x92d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
345 /*0xcc : 0xfe: 0x92e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
346 /*0xcd : 0xff: 0x92f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
347 /*0xcf : 0xff: 0x930 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
348 /*0xd0 : 0x87: 0x931 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
349 /*0xd1 : 0xff: 0x932 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
350 /*0xd2 : 0xb7: 0x933 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
351 /*0xd3 : 0x83: 0x934 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
352 /*0xd4 : 0xff: 0x935 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
353 /*0xd5 : 0xfe: 0x936 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
354 /*0xd6 : 0xbf: 0x937 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
355 /*0xd7 : 0xff: 0x938 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
356 /*0xd8 : 0xff: 0x939 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
357 /*0x00 : 0x00: 0x93A */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
358 /*0x00 : 0x00: 0x93B */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
359 /*0xe9 : 0xda: 0x93c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
360 /*0x00 : 0x00: 0x93d */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
361 /*0xda : 0xff: 0x93e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
362 /*0xdb : 0xff: 0x93f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
363 /*0xdc : 0xff: 0x940 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
364 /*0xdd : 0xff: 0x941 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
365 /*0xde : 0xff: 0x942 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
366 /*0xdf : 0xbe: 0x943 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
367 /*0x00 : 0x00: 0x944 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ BNG_MASK
+ KND_MASK
+ ZERO
+ ZERO
,
368 /*0xe3 : 0x80: 0x945 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
369 /*0xe0 : 0x87: 0x946 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
370 /*0xe1 : 0xff: 0x947 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
371 /*0xe2 : 0xff: 0x948 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
372 /*0xe7 : 0x80: 0x949 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
373 /*0xe4 : 0x87: 0x94a */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
374 /*0xe5 : 0xff: 0x94b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
375 /*0xe6 : 0xff: 0x94c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
376 /*0xe8 : 0xff: 0x94d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
377 /*0xec : 0x00: 0x94e */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
378 /*0xed : 0x00: 0x94f */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
379 /*0x00 : 0x00: 0x950 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
380 /*0x00 : 0x00: 0x951 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
381 /*0x00 : 0x00: 0x952 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
382 /*0x00 : 0x00: 0x953 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
383 /*0x00 : 0x00: 0x954 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
384 /*0x00 : 0x00: 0x955 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
385 /*0x00 : 0x00: 0x956 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
386 /*0x00 : 0x00: 0x957 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ MLM_MASK
+ ZERO
,
387 /*0x00 : 0x00: 0x958 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
388 /*0x00 : 0x00: 0x959 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
389 /*0x00 : 0x00: 0x95a */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
390 /*0x00 : 0x00: 0x95b */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
391 /*0x00 : 0x00: 0x95c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
392 /*0x00 : 0x00: 0x95d */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
393 /*0x00 : 0x00: 0x95e */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
394 /*0xce : 0x98: 0x95f */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
395 /*0x00 : 0x00: 0x960 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
396 /*0x00 : 0x00: 0x961 */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
397 /*0x00 : 0x00: 0x962 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
398 /*0x00 : 0x00: 0x963 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
399 /*0xea : 0xf8: 0x964 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
400 /*0xeaea : 0x00: 0x965*/ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
401 /*0xf1 : 0xff: 0x966 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
402 /*0xf2 : 0xff: 0x967 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
403 /*0xf3 : 0xff: 0x968 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
404 /*0xf4 : 0xff: 0x969 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
405 /*0xf5 : 0xff: 0x96a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
406 /*0xf6 : 0xff: 0x96b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
407 /*0xf7 : 0xff: 0x96c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
408 /*0xf8 : 0xff: 0x96d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
409 /*0xf9 : 0xff: 0x96e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
410 /*0xfa : 0xff: 0x96f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
411 /*0x00 : 0x80: 0x970 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
413 * The length of the array is 128 to provide values for 0x900..0x97f.
414 * The last 15 entries for 0x971..0x97f of the validity table are all zero
415 * because no Indic script uses such Unicode code points.
417 /*0x00 : 0x00: 0x9yz */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
420 static const uint16_t fromUnicodeTable
[128]={
501 0xA1E9 ,/* 0x0950 */ /* OM Symbol */
550 static const uint16_t toUnicodeTable
[256]={
809 static const uint16_t vowelSignESpecialCases
[][2]={
810 { 2 /*length of array*/ , 0 },
814 static const uint16_t nuktaSpecialCases
[][2]={
815 { 16 /*length of array*/ , 0 },
/*
 * Writes the bytes packed in targetByteUnit to the ISCII output.
 *
 * A value <= 0xFF is written as one byte. Larger values are written most
 * significant byte first: the >>16 byte only when the value exceeds 0xFFFF,
 * then the >>8 byte, then the low byte. When offsets are recorded, each
 * written byte gets the same offset, (source - args->source - 1).
 * Bytes that do not fit before targetLimit are appended to
 * args->converter->charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * The macro advances target and offsets and evaluates its arguments more than
 * once, so pass plain variables only.
 * NOTE(review): several lines of the macro (else branches, the offsets
 * checks, closing braces) are missing from this listing.
 */
834 #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
835 int32_t offset = (int32_t)(source - args->source-1); \
836 /* write the targetUniChar to target */ \
837 if(target < targetLimit){ \
838 if(targetByteUnit <= 0xFF){ \
839 *(target)++ = (uint8_t)(targetByteUnit); \
841 *(offsets++) = offset; \
844 if (targetByteUnit > 0xFFFF) { \
845 *(target)++ = (uint8_t)(targetByteUnit>>16); \
848 *(offsets++) = offset; \
851 if (!(target < targetLimit)) { \
852 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
853 (uint8_t)(targetByteUnit >> 8); \
854 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
855 (uint8_t)targetByteUnit; \
856 *err = U_BUFFER_OVERFLOW_ERROR; \
858 *(target)++ = (uint8_t)(targetByteUnit>>8); \
860 *(offsets++) = offset; \
862 if(target < targetLimit){ \
863 *(target)++ = (uint8_t) targetByteUnit; \
865 *(offsets++) = offset ; \
868 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\
869 (uint8_t) (targetByteUnit); \
870 *err = U_BUFFER_OVERFLOW_ERROR; \
875 if (targetByteUnit & 0xFF0000) { \
876 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
877 (uint8_t) (targetByteUnit >>16); \
879 if(targetByteUnit & 0xFF00){ \
880 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
881 (uint8_t) (targetByteUnit >>8); \
883 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
884 (uint8_t) (targetByteUnit); \
885 *err = U_BUFFER_OVERFLOW_ERROR; \
/*
 * Converts UTF-16 code units from args->source into ISCII bytes in
 * args->target, recording offsets through WRITE_TO_TARGET_FROM_U.
 *
 * Visible behavior:
 *  - A NULL converter or reversed source/target limits set
 *    U_ILLEGAL_ARGUMENT_ERROR.
 *  - When the previously written unit was LF (kept in fromUnicodeStatus), ATR
 *    plus the language code of the current script is emitted before the next
 *    character.
 *  - Code units <= ASCII_END are written unchanged.
 *  - Code points in the Indic range are assigned to a script block
 *    (range/newDelta), normalized to Devanagari by subtracting the delta,
 *    mapped through fromUnicodeTable and checked against validityTable.
 *    Danda and Double Danda do not switch the script block.
 *  - A change of script block emits ATR plus the new language code.
 *  - Gurmukhi: Tippi is converted to Bindi, and Adhak followed by a consonant
 *    is written as consonant + Halant + consonant.
 *  - Unmappable input sets U_INVALID_CHAR_FOUND or U_ILLEGAL_CHAR_FOUND and
 *    the offending code point is stored in fromUChar32.
 *  - The advanced source and target pointers are stored back into args.
 *
 * NOTE(review): this listing omits many lines of the function (declarations
 * of newDelta, range and temp, case labels, break/goto statements, else
 * keywords and closing braces); read the comments below with that in mind.
 */
895 static void U_CALLCONV
896 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
897 UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
898 const UChar
*source
= args
->source
;
899 const UChar
*sourceLimit
= args
->sourceLimit
;
900 unsigned char *target
= (unsigned char *) args
->target
;
901 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
902 int32_t* offsets
= args
->offsets
;
903 uint32_t targetByteUnit
= 0x0000;
904 UChar32 sourceChar
= 0x0000;
905 UChar32 tempContextFromUnicode
= 0x0000; /* For special handling of the Gurmukhi script. */
906 UConverterDataISCII
*converterData
;
909 UBool deltaChanged
= FALSE
;
/* reject a missing converter or reversed buffer limits */
911 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)) {
912 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
915 /* initialize data */
916 converterData
=(UConverterDataISCII
*)args
->converter
->extraInfo
;
917 newDelta
=converterData
->currentDeltaFromUnicode
;
918 range
= (uint16_t)(newDelta
/DELTA
);
/* a non-zero fromUChar32 is a code point left pending by a previous call */
920 if ((sourceChar
= args
->converter
->fromUChar32
)!=0) {
924 /*writing the char to the output stream */
925 while (source
< sourceLimit
) {
926 /* Write the language code following LF only if LF is not the last character. */
927 if (args
->converter
->fromUnicodeStatus
== LF
) {
928 targetByteUnit
= ATR
<<8;
929 targetByteUnit
+= (uint8_t) lookupInitialData
[range
].isciiLang
;
930 args
->converter
->fromUnicodeStatus
= 0x0000;
931 /* now append ATR and language code */
932 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
933 if (U_FAILURE(*err
)) {
938 sourceChar
= *source
++;
/* remember the context left by the previous character before it is modified below */
939 tempContextFromUnicode
= converterData
->contextCharFromUnicode
;
941 targetByteUnit
= missingCharMarker
;
943 /*check if input is in ASCII and C0 control codes range*/
944 if (sourceChar
<= ASCII_END
) {
945 args
->converter
->fromUnicodeStatus
= sourceChar
;
946 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,sourceChar
,err
);
947 if (U_FAILURE(*err
)) {
952 switch (sourceChar
) {
954 /* contextChar has HALANT */
955 if (converterData
->contextCharFromUnicode
) {
956 converterData
->contextCharFromUnicode
= 0x00;
957 targetByteUnit
= ISCII_HALANT
;
959 /* consume ZWNJ and continue */
960 converterData
->contextCharFromUnicode
= 0x00;
965 /* contextChar has HALANT */
966 if (converterData
->contextCharFromUnicode
) {
967 targetByteUnit
= ISCII_NUKTA
;
969 targetByteUnit
=ISCII_INV
;
971 converterData
->contextCharFromUnicode
= 0x00;
974 /* is the sourceChar in the INDIC_RANGE? */
975 if ((uint16_t)(INDIC_BLOCK_END
-sourceChar
) <= INDIC_RANGE
) {
976 /* Danda and Double Danda are valid in Northern scripts.. since Unicode
977 * does not include these codepoints in all Northern scripts we need to
980 if (sourceChar
!= DANDA
&& sourceChar
!= DOUBLE_DANDA
) {
981 /* find out to which block the sourceChar belongs*/
982 range
=(uint16_t)((sourceChar
-INDIC_BLOCK_BEGIN
)/DELTA
);
983 newDelta
=(uint16_t)(range
*DELTA
);
985 /* Now are we in the same block as the previous? */
986 if (newDelta
!= converterData
->currentDeltaFromUnicode
|| converterData
->isFirstBuffer
) {
987 converterData
->currentDeltaFromUnicode
= newDelta
;
988 converterData
->currentMaskFromUnicode
= lookupInitialData
[range
].maskEnum
;
990 converterData
->isFirstBuffer
=FALSE
;
993 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
) {
994 if (sourceChar
== PNJ_TIPPI
) {
995 /* Make sure Tippi is converted to Bindi. */
996 sourceChar
= PNJ_BINDI
;
997 } else if (sourceChar
== PNJ_ADHAK
) {
998 /* This is for consonant cluster handling. */
999 converterData
->contextCharFromUnicode
= PNJ_ADHAK
;
1003 /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
1004 /* now subtract the new delta from sourceChar*/
1005 sourceChar
-= converterData
->currentDeltaFromUnicode
;
1008 /* get the target byte unit */
1009 targetByteUnit
=fromUnicodeTable
[(uint8_t)sourceChar
];
1011 /* is the code point valid in current script? */
1012 if ((validityTable
[(uint8_t)sourceChar
] & converterData
->currentMaskFromUnicode
)==0) {
1013 /* Vocallic RR is assigned in ISCII Telugu and Unicode */
1014 if (converterData
->currentDeltaFromUnicode
!=(TELUGU_DELTA
) || sourceChar
!=VOCALLIC_RR
) {
1015 targetByteUnit
=missingCharMarker
;
1020 /* we are in a script block which is different than
1021 * previous sourceChar's script block write ATR and language codes
1024 temp
=(uint16_t)(ATR
<<8);
1025 temp
+= (uint16_t)((uint8_t) lookupInitialData
[range
].isciiLang
);
1028 /* now append ATR and language code */
1029 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,temp
,err
);
1030 if (U_FAILURE(*err
)) {
/* sourceChar was normalized above, so PNJ_DELTA is added back before comparing with Gurmukhi code points */
1035 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& (sourceChar
+ PNJ_DELTA
) == PNJ_ADHAK
) {
1039 /* reset context char */
1040 converterData
->contextCharFromUnicode
= 0x00;
1043 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& tempContextFromUnicode
== PNJ_ADHAK
&& isPNJConsonant((sourceChar
+ PNJ_DELTA
))) {
1044 /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
1045 /* reset context char */
1046 converterData
->contextCharFromUnicode
= 0x0000;
1047 targetByteUnit
= targetByteUnit
<< 16 | ISCII_HALANT
<< 8 | targetByteUnit
;
1048 /* write targetByteUnit to target */
1049 WRITE_TO_TARGET_FROM_U(args
, offsets
, source
, target
, targetLimit
, targetByteUnit
,err
);
1050 if (U_FAILURE(*err
)) {
1053 } else if (targetByteUnit
!= missingCharMarker
) {
/* a written Halant becomes the context for the next character (used by the ZWNJ/ZWJ handling above) */
1054 if (targetByteUnit
==ISCII_HALANT
) {
1055 converterData
->contextCharFromUnicode
= (UChar
)targetByteUnit
;
1057 /* write targetByteUnit to target*/
1058 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
1059 if (U_FAILURE(*err
)) {
1063 /* oops.. the code point is unassigned */
1064 /*check if the char is a First surrogate*/
1065 if (U16_IS_SURROGATE(sourceChar
)) {
1066 if (U16_IS_SURROGATE_LEAD(sourceChar
)) {
1068 /*look ahead to find the trail surrogate*/
1069 if (source
< sourceLimit
) {
1070 /* test the following code unit */
1071 UChar trail
= (*source
);
1072 if (U16_IS_TRAIL(trail
)) {
1074 sourceChar
=U16_GET_SUPPLEMENTARY(sourceChar
, trail
);
1075 *err
=U_INVALID_CHAR_FOUND
;
1076 /* convert this surrogate code point */
1077 /* exit this condition tree */
1079 /* this is an unmatched lead code unit (1st surrogate) */
1080 /* callback(illegal) */
1081 *err
=U_ILLEGAL_CHAR_FOUND
;
1085 *err
= U_ZERO_ERROR
;
1088 /* this is an unmatched trail code unit (2nd surrogate) */
1089 /* callback(illegal) */
1090 *err
=U_ILLEGAL_CHAR_FOUND
;
1093 /* callback(unassigned) for a BMP code point */
1094 *err
= U_INVALID_CHAR_FOUND
;
/* keep the offending code point in the converter for the error callback or the next call */
1097 args
->converter
->fromUChar32
=sourceChar
;
1100 }/* end while (source < sourceLimit) */
1102 /*save the state and return */
1103 args
->source
= source
;
1104 args
->target
= (char*)target
;
/*
 * Script table used by toUnicode after an ATR byte. It is indexed by
 * (sourceChar & 0x0F); column 0 is multiplied by DELTA to get the new
 * currentDeltaToUnicode and column 1 becomes the new currentMaskToUnicode.
 * BENGALI appears twice; the second row is presumably the Assamese code,
 * which shares Bengali's Unicode block and validity bit -- confirm against
 * the ISCII language-code enumeration.
 */
1107 static const uint16_t lookupTable
[][2]={
1108 { ZERO
, ZERO
}, /*DEFAULT*/
1109 { ZERO
, ZERO
}, /*ROMAN*/
1110 { DEVANAGARI
, DEV_MASK
},
1111 { BENGALI
, BNG_MASK
},
1112 { TAMIL
, TML_MASK
},
1113 { TELUGU
, KND_MASK
},
1114 { BENGALI
, BNG_MASK
},
1115 { ORIYA
, ORI_MASK
},
1116 { KANNADA
, KND_MASK
},
1117 { MALAYALAM
, MLM_MASK
},
1118 { GUJARATI
, GJR_MASK
},
1119 { GURMUKHI
, PNJ_MASK
}
/*
 * Writes one UChar to the Unicode output.
 *
 * delta is first added to targetUniChar, unless the value is <= ASCII_END or
 * is one of ZWJ, ZWNJ, DANDA or DOUBLE_DANDA. The result is stored at
 * *target (with offset recorded in *offsets) while target is below
 * args->targetLimit; otherwise it is appended to
 * args->converter->UCharErrorBuffer and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 *
 * The macro modifies targetUniChar, target and offsets in place and evaluates
 * its arguments more than once, so pass plain variables only.
 * NOTE(review): some lines of the macro (the offsets check, else keyword,
 * closing braces) are missing from this listing.
 */
1122 #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
1123 /* add offset to current Indic Block */ \
1124 if(targetUniChar>ASCII_END && \
1125 targetUniChar != ZWJ && \
1126 targetUniChar != ZWNJ && \
1127 targetUniChar != DANDA && \
1128 targetUniChar != DOUBLE_DANDA){ \
1130 targetUniChar+=(uint16_t)(delta); \
1132 /* now write the targetUniChar */ \
1133 if(target<args->targetLimit){ \
1134 *(target)++ = (UChar)targetUniChar; \
1136 *(offsets)++ = (int32_t)(offset); \
1139 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \
1140 (UChar)targetUniChar; \
1141 *err = U_BUFFER_OVERFLOW_ERROR; \
/*
 * Looks up the Unicode mapping of ISCII byte sourceChar in toUnicodeTable and
 * stores it in targetUniChar. For bytes above ASCII_END the mapping is then
 * checked against validityTable (indexed by the low 7 bits of the mapping)
 * using data->currentMaskToUnicode; a mapping that is not valid in the
 * current script is replaced by missingCharMarker, except VOCALLIC_RR while
 * the current delta is TELUGU_DELTA.
 */
1145 #define GET_MAPPING(sourceChar,targetUniChar,data){ \
1146 targetUniChar = toUnicodeTable[(sourceChar)] ; \
1147 /* is the code point valid in current script? */ \
1148 if(sourceChar> ASCII_END && \
1149 (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \
1150 /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
1151 if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
1152 targetUniChar!=VOCALLIC_RR){ \
1153 targetUniChar=missingCharMarker; \
1159 * Rules for the ISCII to Unicode converter
1160 * ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
1161 * which has both precomposed and decomposed forms of characters,
1162 * pre-context and post-context need to be considered.
1165 * i) ATR : Attribute code is used to declare the font and script switching.
1166 * Currently we only switch scripts; font codes are consumed without generating an error
1167 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
1168 * obsolete characters
1170 * i) Halant: if preceded by a halant then it is an explicit halant
1172 * a) if preceded by a halant then it is a soft halant
1173 * b) if preceded by specific consonants and the ligatures have pre-composed
1174 * characters in Unicode then convert to pre-composed characters
1175 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
1179 static void U_CALLCONV
1180 UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
, UErrorCode
* err
) {
1181 const char *source
= ( char *) args
->source
;
1182 UChar
*target
= args
->target
;
1183 const char *sourceLimit
= args
->sourceLimit
;
1184 const UChar
* targetLimit
= args
->targetLimit
;
1185 uint32_t targetUniChar
= 0x0000;
1186 uint8_t sourceChar
= 0x0000;
1187 UConverterDataISCII
* data
;
1188 UChar32
* toUnicodeStatus
=NULL
;
1189 UChar32 tempTargetUniChar
= 0x0000;
1190 UChar
* contextCharToUnicode
= NULL
;
1195 if ((args
->converter
== NULL
) || (target
< args
->target
) || (source
< args
->source
)) {
1196 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1200 data
= (UConverterDataISCII
*)(args
->converter
->extraInfo
);
1201 contextCharToUnicode
= &data
->contextCharToUnicode
; /* contains previous ISCII codepoint visited */
1202 toUnicodeStatus
= (UChar32
*)&args
->converter
->toUnicodeStatus
;/* contains the mapping to Unicode of the above codepoint*/
1204 while (U_SUCCESS(*err
) && source
<sourceLimit
) {
1206 targetUniChar
= missingCharMarker
;
1208 if (target
< targetLimit
) {
1209 sourceChar
= (unsigned char)*(source
)++;
1211 /* look at the post-context and perform special processing */
1212 if (*contextCharToUnicode
==ATR
) {
1214 /* If we have ATR in *contextCharToUnicode then we need to change our
1215 * state to the Indic Script specified by sourceChar
1218 /* check if the sourceChar is supported script range*/
1219 if ((uint8_t)(PNJ
-sourceChar
)<=PNJ
-DEV
) {
1220 data
->currentDeltaToUnicode
= (uint16_t)(lookupTable
[sourceChar
& 0x0F][0] * DELTA
);
1221 data
->currentMaskToUnicode
= (MaskEnum
)lookupTable
[sourceChar
& 0x0F][1];
1222 } else if (sourceChar
==DEF
) {
1223 /* switch back to default */
1224 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1225 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1227 if ((sourceChar
>= 0x21 && sourceChar
<= 0x3F)) {
1228 /* these are display codes consume and continue */
1230 *err
=U_ILLEGAL_CHAR_FOUND
;
1232 *contextCharToUnicode
=NO_CHAR_MARKER
;
1238 *contextCharToUnicode
=NO_CHAR_MARKER
;
1242 } else if (*contextCharToUnicode
==EXT
) {
1243 /* check if sourceChar is in 0xA1-0xEE range */
1244 if ((uint8_t) (EXT_RANGE_END
- sourceChar
) <= (EXT_RANGE_END
- EXT_RANGE_BEGIN
)) {
1245 /* We currently support only Anudatta and Devanagari abbreviation sign */
1246 if (sourceChar
==0xBF || sourceChar
== 0xB8) {
1247 targetUniChar
= (sourceChar
==0xBF) ? DEV_ABBR_SIGN
: DEV_ANUDATTA
;
1249 /* find out if the mapping is valid in this state */
1250 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1251 *contextCharToUnicode
= NO_CHAR_MARKER
;
1253 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1254 if (data
->prevToUnicodeStatus
) {
1255 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1256 data
->prevToUnicodeStatus
= 0x0000;
1258 /* write to target */
1259 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1264 /* byte unit is unassigned */
1265 targetUniChar
= missingCharMarker
;
1266 *err
= U_INVALID_CHAR_FOUND
;
1268 /* only 0xA1 - 0xEE are legal after EXT char */
1269 *contextCharToUnicode
= NO_CHAR_MARKER
;
1270 *err
= U_ILLEGAL_CHAR_FOUND
;
1273 } else if (*contextCharToUnicode
==ISCII_INV
) {
1274 if (sourceChar
==ISCII_HALANT
) {
1275 targetUniChar
= 0x0020; /* replace with space accoding to Indic FAQ */
1277 targetUniChar
= ZWJ
;
1280 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1281 if (data
->prevToUnicodeStatus
) {
1282 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1283 data
->prevToUnicodeStatus
= 0x0000;
1285 /* write to target */
1286 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1288 *contextCharToUnicode
=NO_CHAR_MARKER
;
1291 /* look at the pre-context and perform special processing */
1292 switch (sourceChar
) {
1296 *contextCharToUnicode
= (UChar
)sourceChar
;
1298 if (*toUnicodeStatus
!= missingCharMarker
) {
1299 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1300 if (data
->prevToUnicodeStatus
) {
1301 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1302 data
->prevToUnicodeStatus
= 0x0000;
1304 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1305 *toUnicodeStatus
= missingCharMarker
;
1309 /* handle double danda*/
1310 if (*contextCharToUnicode
== ISCII_DANDA
) {
1311 targetUniChar
= DOUBLE_DANDA
;
1312 /* clear the context */
1313 *contextCharToUnicode
= NO_CHAR_MARKER
;
1314 *toUnicodeStatus
= missingCharMarker
;
1316 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1317 *contextCharToUnicode
= sourceChar
;
1321 /* handle explicit halant */
1322 if (*contextCharToUnicode
== ISCII_HALANT
) {
1323 targetUniChar
= ZWNJ
;
1324 /* clear the context */
1325 *contextCharToUnicode
= NO_CHAR_MARKER
;
1327 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1328 *contextCharToUnicode
= sourceChar
;
1333 data
->resetToDefaultToUnicode
= TRUE
;
1334 GET_MAPPING(sourceChar
,targetUniChar
,data
)
1336 *contextCharToUnicode
= sourceChar
;
1339 case ISCII_VOWEL_SIGN_E
:
1342 for (; i
<vowelSignESpecialCases
[0][0]; i
++) {
1343 U_ASSERT(i
<UPRV_LENGTHOF(vowelSignESpecialCases
));
1344 if (vowelSignESpecialCases
[i
][0]==(uint8_t)*contextCharToUnicode
) {
1345 targetUniChar
=vowelSignESpecialCases
[i
][1];
1351 /* find out if the mapping is valid in this state */
1352 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1353 /*targetUniChar += data->currentDeltaToUnicode ;*/
1354 *contextCharToUnicode
= NO_CHAR_MARKER
;
1355 *toUnicodeStatus
= missingCharMarker
;
1359 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1360 *contextCharToUnicode
= sourceChar
;
1364 /* handle soft halant */
1365 if (*contextCharToUnicode
== ISCII_HALANT
) {
1366 targetUniChar
= ZWJ
;
1367 /* clear the context */
1368 *contextCharToUnicode
= NO_CHAR_MARKER
;
1370 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->contextCharToUnicode
== 0xc0) {
1371 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1372 if (data
->prevToUnicodeStatus
) {
1373 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1374 data
->prevToUnicodeStatus
= 0x0000;
1376 /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
1377 * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
1379 targetUniChar
= PNJ_RRA
;
1380 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1381 if (U_SUCCESS(*err
)) {
1382 targetUniChar
= PNJ_SIGN_VIRAMA
;
1383 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1384 if (U_SUCCESS(*err
)) {
1385 targetUniChar
= PNJ_HA
;
1386 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1388 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1391 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_SIGN_VIRAMA
;
1392 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1394 *toUnicodeStatus
= missingCharMarker
;
1395 data
->contextCharToUnicode
= NO_CHAR_MARKER
;
1398 /* try to handle <CHAR> + ISCII_NUKTA special mappings */
1401 for (; i
<nuktaSpecialCases
[0][0]; i
++) {
1402 if (nuktaSpecialCases
[i
][0]==(uint8_t)
1403 *contextCharToUnicode
) {
1404 targetUniChar
=nuktaSpecialCases
[i
][1];
1410 /* find out if the mapping is valid in this state */
1411 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1412 /*targetUniChar += data->currentDeltaToUnicode ;*/
1413 *contextCharToUnicode
= NO_CHAR_MARKER
;
1414 *toUnicodeStatus
= missingCharMarker
;
1415 if (data
->currentDeltaToUnicode
== PNJ_DELTA
) {
1416 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1417 if (data
->prevToUnicodeStatus
) {
1418 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1419 data
->prevToUnicodeStatus
= 0x0000;
1421 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1426 /* else fall through to default */
1428 /* else fall through to default */
1431 default:GET_MAPPING(sourceChar
,targetUniChar
,data
)
1433 *contextCharToUnicode
= sourceChar
;
1437 if (*toUnicodeStatus
!= missingCharMarker
) {
1438 /* Check to make sure that consonant clusters are handled correctly for Gurmukhi script. */
1439 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->prevToUnicodeStatus
!= 0 && isPNJConsonant(data
->prevToUnicodeStatus
) &&
1440 (*toUnicodeStatus
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& ((UChar32
)(targetUniChar
+ PNJ_DELTA
) == data
->prevToUnicodeStatus
)) {
1441 /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
1442 offset
= (int)(source
-args
->source
- 3);
1443 tempTargetUniChar
= PNJ_ADHAK
; /* This is necessary to avoid some compiler warnings. */
1444 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,tempTargetUniChar
,0,err
);
1445 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,data
->prevToUnicodeStatus
,0,err
);
1446 data
->prevToUnicodeStatus
= 0x0000; /* reset the previous unicode code point */
1447 *toUnicodeStatus
= missingCharMarker
;
1450 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1451 if (data
->prevToUnicodeStatus
) {
1452 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1453 data
->prevToUnicodeStatus
= 0x0000;
1455 /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
1456 * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
1458 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_BINDI
&& isPNJBindiTippi((*toUnicodeStatus
+ PNJ_DELTA
))) {
1459 targetUniChar
= PNJ_TIPPI
- PNJ_DELTA
;
1460 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,PNJ_DELTA
,err
);
1461 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& isPNJConsonant((*toUnicodeStatus
+ PNJ_DELTA
))) {
1462 /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
1463 data
->prevToUnicodeStatus
= *toUnicodeStatus
+ PNJ_DELTA
;
1465 /* write the previously mapped codepoint */
1466 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1469 *toUnicodeStatus
= missingCharMarker
;
1472 if (targetUniChar
!= missingCharMarker
) {
1473 /* now save the targetUniChar for delayed write */
1474 *toUnicodeStatus
= (UChar
) targetUniChar
;
1475 if (data
->resetToDefaultToUnicode
==TRUE
) {
1476 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1477 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1478 data
->resetToDefaultToUnicode
=FALSE
;
1482 /* we reach here only if targetUniChar == missingCharMarker
1483 * so assign codes to reason and err
1485 *err
= U_INVALID_CHAR_FOUND
;
1487 args
->converter
->toUBytes
[0] = (uint8_t) sourceChar
;
1488 args
->converter
->toULength
= 1;
1493 *err
=U_BUFFER_OVERFLOW_ERROR
;
1498 if (U_SUCCESS(*err
) && args
->flush
&& source
== sourceLimit
) {
1499 /* end of the input stream */
1500 UConverter
*cnv
= args
->converter
;
1502 if (*contextCharToUnicode
==ATR
|| *contextCharToUnicode
==EXT
|| *contextCharToUnicode
==ISCII_INV
) {
1503 /* set toUBytes[] */
1504 cnv
->toUBytes
[0] = (uint8_t)*contextCharToUnicode
;
1507 /* avoid looping on truncated sequences */
1508 *contextCharToUnicode
= NO_CHAR_MARKER
;
1513 if (*toUnicodeStatus
!= missingCharMarker
) {
1514 /* output a remaining target character */
1515 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
- args
->source
-1),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1516 *toUnicodeStatus
= missingCharMarker
;
1520 args
->target
= target
;
1521 args
->source
= source
;
1524 /* structure for SafeClone calculations */
1525 struct cloneISCIIStruct
{
1527 UConverterDataISCII mydata
;
1530 static UConverter
* U_CALLCONV
1531 _ISCII_SafeClone(const UConverter
*cnv
,
1533 int32_t *pBufferSize
,
1536 struct cloneISCIIStruct
* localClone
;
1537 int32_t bufferSizeNeeded
= sizeof(struct cloneISCIIStruct
);
1539 if (U_FAILURE(*status
)) {
1543 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
1544 *pBufferSize
= bufferSizeNeeded
;
1548 localClone
= (struct cloneISCIIStruct
*)stackBuffer
;
1549 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1551 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataISCII
));
1552 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
1553 localClone
->cnv
.isExtraLocal
= TRUE
;
1555 return &localClone
->cnv
;
1558 static void U_CALLCONV
1559 _ISCIIGetUnicodeSet(const UConverter
*cnv
,
1560 const USetAdder
*sa
,
1561 UConverterUnicodeSet which
,
1562 UErrorCode
*pErrorCode
)
1567 int32_t idx
, script
;
1570 /* Since all ISCII versions allow switching to other ISCII
1571 scripts, we add all roundtrippable characters to this set. */
1572 sa
->addRange(sa
->set
, 0, ASCII_END
);
1573 for (script
= DEVANAGARI
; script
<= MALAYALAM
; script
++) {
1574 mask
= (uint8_t)(lookupInitialData
[script
].maskEnum
);
1575 for (idx
= 0; idx
< DELTA
; idx
++) {
1576 /* added check for TELUGU character */
1577 if ((validityTable
[idx
] & mask
) || (script
==TELUGU
&& idx
==0x31)) {
1578 sa
->add(sa
->set
, idx
+ (script
* DELTA
) + INDIC_BLOCK_BEGIN
);
1582 sa
->add(sa
->set
, DANDA
);
1583 sa
->add(sa
->set
, DOUBLE_DANDA
);
1584 sa
->add(sa
->set
, ZWNJ
);
1585 sa
->add(sa
->set
, ZWJ
);
1588 static const UConverterImpl _ISCIIImpl
={
1599 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1600 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1601 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
1602 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
1609 _ISCIIGetUnicodeSet
,
1614 static const UConverterStaticData _ISCIIStaticData
={
1615 sizeof(UConverterStaticData
),
1628 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
1632 const UConverterSharedData _ISCIIData
=
1633 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData
, &_ISCIIImpl
);
1635 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */