2 **********************************************************************
3 * Copyright (C) 2000-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvisci.c
8 * tab size: 8 (not used)
11 * created on: 2001JUN26
12 * created by: Ram Viswanadha
14 * Date Name Description
15 * 24/7/2001 Ram Added support for EXT character handling
18 #include "unicode/utypes.h"
20 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
24 #include "unicode/ucnv.h"
26 #include "unicode/ucnv_cb.h"
27 #include "unicode/uset.h"
30 #define UCNV_OPTIONS_VERSION_MASK 0xf
33 #define ZWNJ 0x200c /* Zero Width Non Joiner */
34 #define ZWJ 0x200d /* Zero width Joiner */
35 #define INVALID_CHAR 0xffff
36 #define ATR 0xEF /* Attribute code */
37 #define EXT 0xF0 /* Extension code */
39 #define DOUBLE_DANDA 0x0965
40 #define ISCII_NUKTA 0xE9
41 #define ISCII_HALANT 0xE8
42 #define ISCII_DANDA 0xEA
43 #define ISCII_INV 0xD9
44 #define ISCII_VOWEL_SIGN_E 0xE0
45 #define INDIC_BLOCK_BEGIN 0x0900
46 #define INDIC_BLOCK_END 0x0D7F
47 #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
48 #define VOCALLIC_RR 0x0931
50 #define ASCII_END 0xA0
51 #define NO_CHAR_MARKER 0xFFFE
/* Delta of the Telugu block from the Devanagari base. The replacement list is
 * parenthesized so the product stays a single operand wherever the macro expands. */
52 #define TELUGU_DELTA (DELTA * TELUGU)
53 #define DEV_ABBR_SIGN 0x0970
54 #define DEV_ANUDATTA 0x0952
55 #define EXT_RANGE_BEGIN 0xA1
56 #define EXT_RANGE_END 0xEE
58 #define PNJ_DELTA 0x0100
59 #define PNJ_BINDI 0x0A02
60 #define PNJ_TIPPI 0x0A70
61 #define PNJ_SIGN_VIRAMA 0x0A4D
62 #define PNJ_ADHAK 0x0A71
64 #define PNJ_RRA 0x0A5C
66 static USet
* PNJ_BINDI_TIPPI_SET
= NULL
;
67 static USet
* PNJ_CONSONANT_SET
= NULL
;
83 * Enumeration for switching code pages if <ATR>+<one of below values>
119 #define ISCII_CNV_PREFIX "ISCII,version="
122 UChar contextCharToUnicode
; /* previous ISCII code unit seen by toUnicode (compared against ATR/EXT), or NO_CHAR_MARKER */
123 UChar contextCharFromUnicode
; /* fromUnicode context: ISCII_HALANT after a halant was written, PNJ_ADHAK after a Gurmukhi Adhak, otherwise 0 */
124 uint16_t defDeltaToUnicode
; /* delta for switching to default state when DEF is encountered */
125 uint16_t currentDeltaFromUnicode
; /* current delta in Indic block (fromUnicode direction) */
126 uint16_t currentDeltaToUnicode
; /* current delta in Indic block (toUnicode direction) */
127 MaskEnum currentMaskFromUnicode
; /* mask for current state in fromUnicode */
128 MaskEnum currentMaskToUnicode
; /* mask for current state in toUnicode */
129 MaskEnum defMaskToUnicode
; /* mask for default state in toUnicode */
130 UBool isFirstBuffer
; /* boolean for fromUnicode to see if we need to announce the first script */
131 UBool resetToDefaultToUnicode
; /* boolean for resetting to default delta and mask when a newline is encountered */
132 char name
[sizeof(ISCII_CNV_PREFIX
) + 1]; /* holds ISCII_CNV_PREFIX, one version digit and the terminating NUL */
133 UChar32 prevToUnicodeStatus
; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
134 } UConverterDataISCII
;
136 typedef struct LookupDataStruct
{
/*
 * Per-script start-up data. _ISCIIOpen indexes this table with the converter
 * version (options & UCNV_OPTIONS_VERSION_MASK, accepted when < 9) and reads
 * .uniLang and .maskEnum; the fromUnicode path indexes it with the script
 * block number and reads .maskEnum and .isciiLang.
 * Each row appears to be { uniLang, maskEnum, isciiLang } -- the struct
 * definition is not fully visible here, confirm the field order.
 * The TELUGU row uses KND_MASK: Telugu and Kannada share one validity bit.
 */
142 static const LookupDataStruct lookupInitialData
[]={
143 { DEVANAGARI
, DEV_MASK
, DEV
},
144 { BENGALI
, BNG_MASK
, BNG
},
145 { GURMUKHI
, PNJ_MASK
, PNJ
},
146 { GUJARATI
, GJR_MASK
, GJR
},
147 { ORIYA
, ORI_MASK
, ORI
},
148 { TAMIL
, TML_MASK
, TML
},
149 { TELUGU
, KND_MASK
, TLG
},
150 { KANNADA
, KND_MASK
, KND
},
151 { MALAYALAM
, MLM_MASK
, MLM
}
/*
 * Builds the two file-level sets used for Gurmukhi special handling:
 * PNJ_CONSONANT_SET receives four code point ranges in 0x0A15..0x0A39, and
 * PNJ_BINDI_TIPPI_SET starts as a clone of it with further code points added.
 * Both sets are compacted at the end.
 * NOTE(review): no NULL check on the results of uset_open()/uset_clone() is
 * visible here -- confirm whether allocation failure is handled.
 */
154 static void initializeSets() {
155 /* TODO: Replace the following two lines with PNJ_CONSONANT_SET = uset_openEmpty(); */
/* Open a set and clear it at once so that it starts out empty. */
156 PNJ_CONSONANT_SET
= uset_open(0,0);
157 uset_clear(PNJ_CONSONANT_SET
);
159 uset_addRange(PNJ_CONSONANT_SET
, 0x0A15, 0x0A28);
160 uset_addRange(PNJ_CONSONANT_SET
, 0x0A2A, 0x0A30);
161 uset_addRange(PNJ_CONSONANT_SET
, 0x0A35, 0x0A36);
162 uset_addRange(PNJ_CONSONANT_SET
, 0x0A38, 0x0A39);
/* The Bindi/Tippi set is the consonant set plus the code points added below. */
164 PNJ_BINDI_TIPPI_SET
= uset_clone(PNJ_CONSONANT_SET
);
165 uset_add(PNJ_BINDI_TIPPI_SET
, 0x0A05);
166 uset_add(PNJ_BINDI_TIPPI_SET
, 0x0A07);
167 uset_add(PNJ_BINDI_TIPPI_SET
, 0x0A3F);
168 uset_addRange(PNJ_BINDI_TIPPI_SET
, 0x0A41, 0x0A42);
170 uset_compact(PNJ_CONSONANT_SET
);
171 uset_compact(PNJ_BINDI_TIPPI_SET
);
/*
 * Open callback for the ISCII converter.
 * Allocates a UConverterDataISCII into cnv->extraInfo and initializes the
 * to/from-Unicode state from the version held in
 * pArgs->options & UCNV_OPTIONS_VERSION_MASK.
 * Sets *errorCode to U_ILLEGAL_ARGUMENT_ERROR (after freeing the data) on the
 * unsupported-version path, and to U_MEMORY_ALLOCATION_ERROR on the
 * allocation-failure path.
 * The pArgs->onlyTestIsLoadable branch is not fully visible here; it
 * presumably returns without allocating -- confirm.
 */
174 static void _ISCIIOpen(UConverter
*cnv
, UConverterLoadArgs
*pArgs
, UErrorCode
*errorCode
) {
175 if(pArgs
->onlyTestIsLoadable
) {
179 /* Ensure that the sets used in special handling of certain Gurmukhi characters are initialized. */
182 cnv
->extraInfo
= uprv_malloc(sizeof(UConverterDataISCII
));
184 if (cnv
->extraInfo
!= NULL
) {
186 UConverterDataISCII
*converterData
=
187 (UConverterDataISCII
*) cnv
->extraInfo
;
188 converterData
->contextCharToUnicode
=NO_CHAR_MARKER
;
189 cnv
->toUnicodeStatus
= missingCharMarker
;
190 converterData
->contextCharFromUnicode
=0x0000;
191 converterData
->resetToDefaultToUnicode
=FALSE
;
192 /* check if the version requested is supported */
/* Versions 0..8 match the nine rows of lookupInitialData and keep the name suffix a single digit. */
193 if ((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) < 9) {
194 /* initialize state variables */
/* The current from/to deltas and the default delta all start at the selected script's delta. */
195 converterData
->currentDeltaFromUnicode
196 = converterData
->currentDeltaToUnicode
197 = converterData
->defDeltaToUnicode
= (uint16_t)(lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].uniLang
* DELTA
);
/* Likewise the current from/to masks and the default mask share one initial value. */
199 converterData
->currentMaskFromUnicode
200 = converterData
->currentMaskToUnicode
201 = converterData
->defMaskToUnicode
= lookupInitialData
[pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
].maskEnum
;
203 converterData
->isFirstBuffer
=TRUE
;
/* Build the converter name: ISCII_CNV_PREFIX followed by the version digit and a NUL. */
204 (void)uprv_strcpy(converterData
->name
, ISCII_CNV_PREFIX
);
205 len
= (int32_t)uprv_strlen(converterData
->name
);
206 converterData
->name
[len
]= (char)((pArgs
->options
& UCNV_OPTIONS_VERSION_MASK
) + '0');
207 converterData
->name
[len
+1]=0;
209 converterData
->prevToUnicodeStatus
= 0x0000;
/* Unsupported version: release the private data and report an argument error. */
211 uprv_free(cnv
->extraInfo
);
212 cnv
->extraInfo
= NULL
;
213 *errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
/* Allocation of the private data failed. */
217 *errorCode
=U_MEMORY_ALLOCATION_ERROR
;
/*
 * Close callback for the ISCII converter.
 * Frees cnv->extraInfo with uprv_free() when it is non-NULL and the converter
 * is not flagged isExtraLocal, then closes the two file-level Gurmukhi sets
 * and sets their pointers back to NULL.
 * NOTE(review): PNJ_CONSONANT_SET and PNJ_BINDI_TIPPI_SET are file-scope
 * statics, not per-converter data. Closing them here looks unsafe if another
 * ISCII converter is still open or converting on another thread -- confirm
 * how they are re-created and whether access is synchronized.
 */
221 static void _ISCIIClose(UConverter
*cnv
) {
222 if (cnv
->extraInfo
!=NULL
) {
223 if (!cnv
->isExtraLocal
) {
224 uprv_free(cnv
->extraInfo
);
228 if (PNJ_CONSONANT_SET
!= NULL
) {
229 uset_close(PNJ_CONSONANT_SET
);
230 PNJ_CONSONANT_SET
= NULL
;
232 if (PNJ_BINDI_TIPPI_SET
!= NULL
) {
233 uset_close(PNJ_BINDI_TIPPI_SET
);
234 PNJ_BINDI_TIPPI_SET
= NULL
;
/*
 * Name callback for the ISCII converter.
 * When cnv->extraInfo is set, it is viewed as the converter's
 * UConverterDataISCII. The return statements are not visible here; presumably
 * the stored name (ISCII_CNV_PREFIX plus version digit) is returned -- confirm.
 */
238 static const char* _ISCIIgetName(const UConverter
* cnv
) {
239 if (cnv
->extraInfo
) {
240 UConverterDataISCII
* myData
= (UConverterDataISCII
*)cnv
->extraInfo
;
/*
 * Reset callback for the ISCII converter.
 * When choice <= UCNV_RESET_TO_UNICODE the toUnicode state is restored to its
 * defaults; when choice != UCNV_RESET_TO_UNICODE the fromUnicode state is
 * restored. Dereferences cnv->extraInfo without a NULL check.
 */
246 static void _ISCIIReset(UConverter
*cnv
, UConverterResetChoice choice
) {
247 UConverterDataISCII
* data
=(UConverterDataISCII
*) (cnv
->extraInfo
);
/* toUnicode direction: drop pending context and return to the default script. */
248 if (choice
<=UCNV_RESET_TO_UNICODE
) {
249 cnv
->toUnicodeStatus
= missingCharMarker
;
251 data
->currentDeltaToUnicode
=data
->defDeltaToUnicode
;
252 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
253 data
->contextCharToUnicode
=NO_CHAR_MARKER
;
254 data
->prevToUnicodeStatus
= 0x0000;
/* fromUnicode direction. There is no separate fromUnicode default: the
 * defaults stored for toUnicode are reused (_ISCIIOpen gives both directions
 * the same initial delta and mask). */
256 if (choice
!=UCNV_RESET_TO_UNICODE
) {
257 cnv
->fromUChar32
=0x0000;
258 data
->contextCharFromUnicode
=0x00;
259 data
->currentMaskFromUnicode
=data
->defMaskToUnicode
;
260 data
->currentDeltaFromUnicode
=data
->defDeltaToUnicode
;
261 data
->isFirstBuffer
=TRUE
;
/* NOTE(review): by its name this flag belongs to the toUnicode state, yet it is cleared in the fromUnicode branch -- confirm that is intended. */
262 data
->resetToDefaultToUnicode
=FALSE
;
267 * The values in the validity table are indexed by the lower bits of Unicode
268 * range 0x0900 - 0x097f. The values have a structure like:
269 * ---------------------------------------------------------------
270 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
271 * | | | | | ASM | KND | | |
272 * ---------------------------------------------------------------
273 * If a code point is valid in a particular script
274 * then that bit is turned on
276 * Unicode does not distinguish between Bengali and Assamese so we use 1 bit
277 * to represent these languages
279 * Telugu and Kannada have same codepoints except for Vocallic_RR which we special case
280 * and combine and use 1 bit to represent these languages.
282 * TODO: It is probably easier to understand and maintain to change this
283 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
286 static const uint8_t validityTable
[128] = {
287 /* This state table is tool generated please do not edit unless you know exactly what you are doing */
288 /* Note: This table was edited to mirror the Windows XP implementation */
289 /*ISCII:Valid:Unicode */
290 /*0xa0 : 0x00: 0x900 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
291 /*0xa1 : 0xb8: 0x901 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
292 /*0xa2 : 0xfe: 0x902 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
293 /*0xa3 : 0xbf: 0x903 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
294 /*0x00 : 0x00: 0x904 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
295 /*0xa4 : 0xff: 0x905 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
296 /*0xa5 : 0xff: 0x906 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
297 /*0xa6 : 0xff: 0x907 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
298 /*0xa7 : 0xff: 0x908 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
299 /*0xa8 : 0xff: 0x909 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
300 /*0xa9 : 0xff: 0x90a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
301 /*0xaa : 0xfe: 0x90b */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
302 /*0x00 : 0x00: 0x90c */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
303 /*0xae : 0x80: 0x90d */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
304 /*0xab : 0x87: 0x90e */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
305 /*0xac : 0xff: 0x90f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
306 /*0xad : 0xff: 0x910 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
307 /*0xb2 : 0x80: 0x911 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
308 /*0xaf : 0x87: 0x912 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
309 /*0xb0 : 0xff: 0x913 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
310 /*0xb1 : 0xff: 0x914 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
311 /*0xb3 : 0xff: 0x915 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
312 /*0xb4 : 0xfe: 0x916 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
313 /*0xb5 : 0xfe: 0x917 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
314 /*0xb6 : 0xfe: 0x918 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
315 /*0xb7 : 0xff: 0x919 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
316 /*0xb8 : 0xff: 0x91a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
317 /*0xb9 : 0xfe: 0x91b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
318 /*0xba : 0xff: 0x91c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
319 /*0xbb : 0xfe: 0x91d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
320 /*0xbc : 0xff: 0x91e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
321 /*0xbd : 0xff: 0x91f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
322 /*0xbe : 0xfe: 0x920 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
323 /*0xbf : 0xfe: 0x921 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
324 /*0xc0 : 0xfe: 0x922 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
325 /*0xc1 : 0xff: 0x923 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
326 /*0xc2 : 0xff: 0x924 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
327 /*0xc3 : 0xfe: 0x925 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
328 /*0xc4 : 0xfe: 0x926 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
329 /*0xc5 : 0xfe: 0x927 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
330 /*0xc6 : 0xff: 0x928 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
331 /*0xc7 : 0x81: 0x929 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ TML_MASK
,
332 /*0xc8 : 0xff: 0x92a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
333 /*0xc9 : 0xfe: 0x92b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
334 /*0xca : 0xfe: 0x92c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
335 /*0xcb : 0xfe: 0x92d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
336 /*0xcc : 0xfe: 0x92e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
337 /*0xcd : 0xff: 0x92f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
338 /*0xcf : 0xff: 0x930 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
339 /*0xd0 : 0x87: 0x931 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
340 /*0xd1 : 0xff: 0x932 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
341 /*0xd2 : 0xb7: 0x933 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
342 /*0xd3 : 0x83: 0x934 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
343 /*0xd4 : 0xff: 0x935 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
344 /*0xd5 : 0xfe: 0x936 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
345 /*0xd6 : 0xbf: 0x937 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
346 /*0xd7 : 0xff: 0x938 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
347 /*0xd8 : 0xff: 0x939 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
348 /*0x00 : 0x00: 0x93A */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
349 /*0x00 : 0x00: 0x93B */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
350 /*0xe9 : 0xda: 0x93c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
351 /*0x00 : 0x00: 0x93d */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
352 /*0xda : 0xff: 0x93e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
353 /*0xdb : 0xff: 0x93f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
354 /*0xdc : 0xff: 0x940 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
355 /*0xdd : 0xff: 0x941 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
356 /*0xde : 0xff: 0x942 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
357 /*0xdf : 0xbe: 0x943 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
358 /*0x00 : 0x00: 0x944 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ BNG_MASK
+ KND_MASK
+ ZERO
+ ZERO
,
359 /*0xe3 : 0x80: 0x945 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
360 /*0xe0 : 0x87: 0x946 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
361 /*0xe1 : 0xff: 0x947 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
362 /*0xe2 : 0xff: 0x948 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
363 /*0xe7 : 0x80: 0x949 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
364 /*0xe4 : 0x87: 0x94a */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
365 /*0xe5 : 0xff: 0x94b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
366 /*0xe6 : 0xff: 0x94c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
367 /*0xe8 : 0xff: 0x94d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
368 /*0xec : 0x00: 0x94e */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
369 /*0xed : 0x00: 0x94f */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
370 /*0x00 : 0x00: 0x950 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
371 /*0x00 : 0x00: 0x951 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
372 /*0x00 : 0x00: 0x952 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
373 /*0x00 : 0x00: 0x953 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
374 /*0x00 : 0x00: 0x954 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
375 /*0x00 : 0x00: 0x955 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
376 /*0x00 : 0x00: 0x956 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
377 /*0x00 : 0x00: 0x957 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ MLM_MASK
+ ZERO
,
378 /*0x00 : 0x00: 0x958 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
379 /*0x00 : 0x00: 0x959 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
380 /*0x00 : 0x00: 0x95a */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
381 /*0x00 : 0x00: 0x95b */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
382 /*0x00 : 0x00: 0x95c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
383 /*0x00 : 0x00: 0x95d */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
384 /*0x00 : 0x00: 0x95e */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
385 /*0xce : 0x98: 0x95f */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
386 /*0x00 : 0x00: 0x960 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
387 /*0x00 : 0x00: 0x961 */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
388 /*0x00 : 0x00: 0x962 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
389 /*0x00 : 0x00: 0x963 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
390 /*0xea : 0xf8: 0x964 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
391 /*0xeaea : 0x00: 0x965*/ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
392 /*0xf1 : 0xff: 0x966 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
393 /*0xf2 : 0xff: 0x967 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
394 /*0xf3 : 0xff: 0x968 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
395 /*0xf4 : 0xff: 0x969 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
396 /*0xf5 : 0xff: 0x96a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
397 /*0xf6 : 0xff: 0x96b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
398 /*0xf7 : 0xff: 0x96c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
399 /*0xf8 : 0xff: 0x96d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
400 /*0xf9 : 0xff: 0x96e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
401 /*0xfa : 0xff: 0x96f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
402 /*0x00 : 0x80: 0x970 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
404 * The length of the array is 128 to provide values for 0x900..0x97f.
405 * The last 15 entries for 0x971..0x97f of the validity table are all zero
406 * because no Indic script uses such Unicode code points.
408 /*0x00 : 0x00: 0x9yz */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
411 static const uint16_t fromUnicodeTable
[128]={
492 0xA1E9 ,/* 0x0950 */ /* OM Symbol */
541 static const uint16_t toUnicodeTable
[256]={
800 static const uint16_t vowelSignESpecialCases
[][2]={
801 { 2 /*length of array*/ , 0 },
805 static const uint16_t nuktaSpecialCases
[][2]={
806 { 16 /*length of array*/ , 0 },
/*
 * Writes the bytes of targetByteUnit to target, most significant byte first:
 * one byte when the value is <= 0xFF, otherwise a high-to-low byte sequence
 * (a byte from bits 16..23 is written when the value is > 0xFFFF).
 * Bytes that do not fit before targetLimit are appended to the converter's
 * charErrorBuffer and *err is set to U_BUFFER_OVERFLOW_ERROR.
 * Each byte written records `offset`, computed as source - args->source - 1;
 * the guard around the offsets writes is not visible here.
 * The macro arguments are expanded unparenthesized and more than once, so
 * callers should pass plain variables.
 */
825 #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
826 int32_t offset = (int32_t)(source - args->source-1); \
827 /* write the targetUniChar to target */ \
828 if(target < targetLimit){ \
829 if(targetByteUnit <= 0xFF){ \
830 *(target)++ = (uint8_t)(targetByteUnit); \
832 *(offsets++) = offset; \
835 if (targetByteUnit > 0xFFFF) { \
836 *(target)++ = (uint8_t)(targetByteUnit>>16); \
839 *(offsets++) = offset; \
842 if (!(target < targetLimit)) { \
843 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
844 (uint8_t)(targetByteUnit >> 8); \
845 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
846 (uint8_t)targetByteUnit; \
847 *err = U_BUFFER_OVERFLOW_ERROR; \
849 *(target)++ = (uint8_t)(targetByteUnit>>8); \
851 *(offsets++) = offset; \
853 if(target < targetLimit){ \
854 *(target)++ = (uint8_t) targetByteUnit; \
856 *(offsets++) = offset ; \
859 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\
860 (uint8_t) (targetByteUnit); \
861 *err = U_BUFFER_OVERFLOW_ERROR; \
866 if (targetByteUnit & 0xFF0000) { \
867 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
868 (uint8_t) (targetByteUnit >>16); \
870 if(targetByteUnit & 0xFF00){ \
871 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
872 (uint8_t) (targetByteUnit >>8); \
874 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
875 (uint8_t) (targetByteUnit); \
876 *err = U_BUFFER_OVERFLOW_ERROR; \
887 static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
888 UConverterFromUnicodeArgs
* args
, UErrorCode
* err
) {
889 const UChar
*source
= args
->source
;
890 const UChar
*sourceLimit
= args
->sourceLimit
;
891 unsigned char *target
= (unsigned char *) args
->target
;
892 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
893 int32_t* offsets
= args
->offsets
;
894 uint32_t targetByteUnit
= 0x0000;
895 UChar32 sourceChar
= 0x0000;
896 UChar32 tempContextFromUnicode
= 0x0000; /* For special handling of the Gurmukhi script. */
897 UConverterDataISCII
*converterData
;
900 UBool deltaChanged
= FALSE
;
902 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)) {
903 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
906 /* initialize data */
907 converterData
=(UConverterDataISCII
*)args
->converter
->extraInfo
;
908 newDelta
=converterData
->currentDeltaFromUnicode
;
909 range
= (uint16_t)(newDelta
/DELTA
);
911 if ((sourceChar
= args
->converter
->fromUChar32
)!=0) {
915 /*writing the char to the output stream */
916 while (source
< sourceLimit
) {
917 /* Write the language code following LF only if LF is not the last character. */
918 if (args
->converter
->fromUnicodeStatus
== LF
) {
919 targetByteUnit
= ATR
<<8;
920 targetByteUnit
+= (uint8_t) lookupInitialData
[range
].isciiLang
;
921 args
->converter
->fromUnicodeStatus
= 0x0000;
922 /* now append ATR and language code */
923 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
924 if (U_FAILURE(*err
)) {
929 sourceChar
= *source
++;
930 tempContextFromUnicode
= converterData
->contextCharFromUnicode
;
932 targetByteUnit
= missingCharMarker
;
934 /*check if input is in ASCII and C0 control codes range*/
935 if (sourceChar
<= ASCII_END
) {
936 args
->converter
->fromUnicodeStatus
= sourceChar
;
937 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,sourceChar
,err
);
938 if (U_FAILURE(*err
)) {
943 switch (sourceChar
) {
945 /* contextChar has HALANT */
946 if (converterData
->contextCharFromUnicode
) {
947 converterData
->contextCharFromUnicode
= 0x00;
948 targetByteUnit
= ISCII_HALANT
;
950 /* consume ZWNJ and continue */
951 converterData
->contextCharFromUnicode
= 0x00;
956 /* contextChar has HALANT */
957 if (converterData
->contextCharFromUnicode
) {
958 targetByteUnit
= ISCII_NUKTA
;
960 targetByteUnit
=ISCII_INV
;
962 converterData
->contextCharFromUnicode
= 0x00;
965 /* is the sourceChar in the INDIC_RANGE? */
966 if ((uint16_t)(INDIC_BLOCK_END
-sourceChar
) <= INDIC_RANGE
) {
967 /* Danda and Double Danda are valid in Northern scripts. Since Unicode
968 * does not include these codepoints in all Northern scripts we need to
971 if (sourceChar
!= DANDA
&& sourceChar
!= DOUBLE_DANDA
) {
972 /* find out to which block the sourceChar belongs */
973 range
=(uint16_t)((sourceChar
-INDIC_BLOCK_BEGIN
)/DELTA
);
974 newDelta
=(uint16_t)(range
*DELTA
);
976 /* Now are we in the same block as the previous? */
977 if (newDelta
!= converterData
->currentDeltaFromUnicode
|| converterData
->isFirstBuffer
) {
978 converterData
->currentDeltaFromUnicode
= newDelta
;
979 converterData
->currentMaskFromUnicode
= lookupInitialData
[range
].maskEnum
;
981 converterData
->isFirstBuffer
=FALSE
;
984 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
) {
985 if (sourceChar
== PNJ_TIPPI
) {
986 /* Make sure Tippi is converted to Bindi. */
987 sourceChar
= PNJ_BINDI
;
988 } else if (sourceChar
== PNJ_ADHAK
) {
989 /* This is for consonant cluster handling. */
990 converterData
->contextCharFromUnicode
= PNJ_ADHAK
;
994 /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
995 /* now subtract the new delta from sourceChar*/
996 sourceChar
-= converterData
->currentDeltaFromUnicode
;
999 /* get the target byte unit */
1000 targetByteUnit
=fromUnicodeTable
[(uint8_t)sourceChar
];
1002 /* is the code point valid in current script? */
1003 if ((validityTable
[(uint8_t)sourceChar
] & converterData
->currentMaskFromUnicode
)==0) {
1004 /* Vocallic RR is assigned in ISCII Telugu and Unicode */
1005 if (converterData
->currentDeltaFromUnicode
!=(TELUGU_DELTA
) || sourceChar
!=VOCALLIC_RR
) {
1006 targetByteUnit
=missingCharMarker
;
1011 /* we are in a script block which is different than
1012 * previous sourceChar's script block write ATR and language codes
1015 temp
=(uint16_t)(ATR
<<8);
1016 temp
+= (uint16_t)((uint8_t) lookupInitialData
[range
].isciiLang
);
1019 /* now append ATR and language code */
1020 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,temp
,err
);
1021 if (U_FAILURE(*err
)) {
1026 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& (sourceChar
+ PNJ_DELTA
) == PNJ_ADHAK
) {
1030 /* reset context char */
1031 converterData
->contextCharFromUnicode
= 0x00;
1034 if (converterData
->currentDeltaFromUnicode
== PNJ_DELTA
&& tempContextFromUnicode
== PNJ_ADHAK
&& uset_contains(PNJ_CONSONANT_SET
, (sourceChar
+ PNJ_DELTA
))) {
1035 /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
1036 /* reset context char */
1037 converterData
->contextCharFromUnicode
= 0x0000;
1038 targetByteUnit
= targetByteUnit
<< 16 | ISCII_HALANT
<< 8 | targetByteUnit
;
1039 /* write targetByteUnit to target */
1040 WRITE_TO_TARGET_FROM_U(args
, offsets
, source
, target
, targetLimit
, targetByteUnit
,err
);
1041 if (U_FAILURE(*err
)) {
1044 } else if (targetByteUnit
!= missingCharMarker
) {
1045 if (targetByteUnit
==ISCII_HALANT
) {
1046 converterData
->contextCharFromUnicode
= (UChar
)targetByteUnit
;
1048 /* write targetByteUnit to target*/
1049 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
1050 if (U_FAILURE(*err
)) {
1054 /* oops.. the code point is unassigned */
1055 /*check if the char is a First surrogate*/
1056 if (UTF_IS_SURROGATE(sourceChar
)) {
1057 if (UTF_IS_SURROGATE_FIRST(sourceChar
)) {
1059 /*look ahead to find the trail surrogate*/
1060 if (source
< sourceLimit
) {
1061 /* test the following code unit */
1062 UChar trail
= (*source
);
1063 if (UTF_IS_SECOND_SURROGATE(trail
)) {
1065 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
1066 *err
=U_INVALID_CHAR_FOUND
;
1067 /* convert this surrogate code point */
1068 /* exit this condition tree */
1070 /* this is an unmatched lead code unit (1st surrogate) */
1071 /* callback(illegal) */
1072 *err
=U_ILLEGAL_CHAR_FOUND
;
1076 *err
= U_ZERO_ERROR
;
1079 /* this is an unmatched trail code unit (2nd surrogate) */
1080 /* callback(illegal) */
1081 *err
=U_ILLEGAL_CHAR_FOUND
;
1084 /* callback(unassigned) for a BMP code point */
1085 *err
= U_INVALID_CHAR_FOUND
;
1088 args
->converter
->fromUChar32
=sourceChar
;
1091 }/* end while(mySourceIndex<mySourceLength) */
1093 /*save the state and return */
1094 args
->source
= source
;
1095 args
->target
= (char*)target
;
/*
 * Script lookup used by the toUnicode path after an ATR byte. It is indexed
 * with the low nibble of the script code byte (sourceChar & 0x0F).
 * Column 0 is multiplied by DELTA to give currentDeltaToUnicode; column 1 is
 * the validity mask stored in currentMaskToUnicode.
 * Both the Telugu and Kannada rows use KND_MASK, and two rows map to BENGALI
 * (Bengali and Assamese share one Unicode block).
 */
1098 static const uint16_t lookupTable
[][2]={
1099 { ZERO
, ZERO
}, /*DEFAULT*/
1100 { ZERO
, ZERO
}, /*ROMAN*/
1101 { DEVANAGARI
, DEV_MASK
},
1102 { BENGALI
, BNG_MASK
},
1103 { TAMIL
, TML_MASK
},
1104 { TELUGU
, KND_MASK
},
1105 { BENGALI
, BNG_MASK
},
1106 { ORIYA
, ORI_MASK
},
1107 { KANNADA
, KND_MASK
},
1108 { MALAYALAM
, MLM_MASK
},
1109 { GUJARATI
, GJR_MASK
},
1110 { GURMUKHI
, PNJ_MASK
}
/*
 * Emits one UChar for the toUnicode path.
 * Values above ASCII_END, other than ZWJ, ZWNJ, DANDA and DOUBLE_DANDA, are
 * first shifted by `delta` into the current Indic block. The result is written
 * to target when there is room before args->targetLimit (recording `offset`);
 * otherwise it is appended to the converter's UCharErrorBuffer and *err is set
 * to U_BUFFER_OVERFLOW_ERROR.
 * targetUniChar is modified in place, so callers must pass an lvalue.
 */
1113 #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
1114 /* add offset to current Indic Block */ \
1115 if(targetUniChar>ASCII_END && \
1116 targetUniChar != ZWJ && \
1117 targetUniChar != ZWNJ && \
1118 targetUniChar != DANDA && \
1119 targetUniChar != DOUBLE_DANDA){ \
1121 targetUniChar+=(uint16_t)(delta); \
1123 /* now write the targetUniChar */ \
1124 if(target<args->targetLimit){ \
1125 *(target)++ = (UChar)targetUniChar; \
1127 *(offsets)++ = (int32_t)(offset); \
1130 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \
1131 (UChar)targetUniChar; \
1132 *err = U_BUFFER_OVERFLOW_ERROR; \
/*
 * Maps an ISCII byte to its Devanagari-normalized Unicode value through
 * toUnicodeTable. For bytes above ASCII_END the result is replaced by
 * missingCharMarker when validityTable says the code point is not valid for
 * the current script mask, except for VOCALLIC_RR while the current delta is
 * TELUGU_DELTA.
 */
1136 #define GET_MAPPING(sourceChar,targetUniChar,data){ \
1137 targetUniChar = toUnicodeTable[(sourceChar)] ; \
1138 /* is the code point valid in current script? */ \
1139 if(sourceChar> ASCII_END && \
1140 (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){ \
1141 /* Vocallic RR is assigned in ISCII Telugu and Unicode */ \
1142 if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
1143 targetUniChar!=VOCALLIC_RR){ \
1144 targetUniChar=missingCharMarker; \
1150 * Rules for ISCII to Unicode converter
1151 * ISCII is a stateful encoding. To convert ISCII bytes to Unicode,
1152 * which has both precomposed and decomposed forms of characters,
1153 * pre-context and post-context need to be considered.
1156 * i) ATR : Attribute code is used to declare the font and script switching.
1157 * Currently we only switch scripts; font codes are consumed without generating an error
1158 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
1159 * obsolete characters
1161 * i) Halant: if preceded by a halant then it is an explicit halant
1163 * a) if preceded by a halant then it is a soft halant
1164 * b) if preceded by specific consonants and the ligatures have pre-composed
1165 * characters in Unicode then convert to pre-composed characters
1166 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
/*
 * Converts ISCII bytes from args->source into UChars at args->target.
 *
 * The byte read most recently is remembered in data->contextCharToUnicode
 * and its mapped code unit is held back in converter->toUnicodeStatus, so
 * that the next byte (ATR, EXT, INV, halant, nukta, danda, vowel sign E)
 * can still change what is finally written.  When the current delta is
 * PNJ_DELTA, data->prevToUnicodeStatus additionally holds back a consonant
 * so that consonant + virama + same consonant can be written as
 * PNJ_ADHAK + consonant.
 *
 * Error codes set through *err in the visible code:
 *   U_ILLEGAL_ARGUMENT_ERROR  converter is NULL or a cursor precedes its buffer
 *   U_ILLEGAL_CHAR_FOUND      bad byte after ATR or EXT
 *   U_INVALID_CHAR_FOUND      byte with no valid mapping in the current script
 *   U_BUFFER_OVERFLOW_ERROR   target is full
 *
 * When args->flush is set and all input was consumed, a pending ATR/EXT/INV
 * byte is copied to converter->toUBytes[0] and a held-back code unit is
 * written out.  args->source and args->target are updated before returning.
 */
1170 static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
, UErrorCode
* err
) {
1171 const char *source
= ( char *) args
->source
;
1172 UChar
*target
= args
->target
;
1173 const char *sourceLimit
= args
->sourceLimit
;
1174 const UChar
* targetLimit
= args
->targetLimit
;
1175 uint32_t targetUniChar
= 0x0000;
1176 uint8_t sourceChar
= 0x0000;
1177 UConverterDataISCII
* data
;
1178 UChar32
* toUnicodeStatus
=NULL
;
1179 UChar32 tempTargetUniChar
= 0x0000;
1180 UChar
* contextCharToUnicode
= NULL
;
/* reject a missing converter, or cursors that lie before the caller's buffers */
1185 if ((args
->converter
== NULL
) || (target
< args
->target
) || (source
< args
->source
)) {
1186 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1190 data
= (UConverterDataISCII
*)(args
->converter
->extraInfo
);
1191 contextCharToUnicode
= &data
->contextCharToUnicode
; /* contains previous ISCII codepoint visited */
1192 toUnicodeStatus
= (UChar32
*)&args
->converter
->toUnicodeStatus
;/* contains the mapping to Unicode of the above codepoint*/
1194 while (U_SUCCESS(*err
) && source
<sourceLimit
) {
1196 targetUniChar
= missingCharMarker
;
1198 if (target
< targetLimit
) {
1199 sourceChar
= (unsigned char)*(source
)++;
1201 /* look at the post-context and perform special processing */
1202 if (*contextCharToUnicode
==ATR
) {
1204 /* If we have ATR in *contextCharToUnicode then we need to change our
1205 * state to the Indic Script specified by sourceChar
1208 /* check if the sourceChar is in the supported script range */
1209 if ((uint8_t)(PNJ
-sourceChar
)<=PNJ
-DEV
) {
1210 data
->currentDeltaToUnicode
= (uint16_t)(lookupTable
[sourceChar
& 0x0F][0] * DELTA
);
1211 data
->currentMaskToUnicode
= (MaskEnum
)lookupTable
[sourceChar
& 0x0F][1];
1212 } else if (sourceChar
==DEF
) {
1213 /* switch back to default */
1214 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1215 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1217 if ((sourceChar
>= 0x21 && sourceChar
<= 0x3F)) {
1218 /* these are display codes consume and continue */
1220 *err
=U_ILLEGAL_CHAR_FOUND
;
1222 *contextCharToUnicode
=NO_CHAR_MARKER
;
1228 *contextCharToUnicode
=NO_CHAR_MARKER
;
1232 } else if (*contextCharToUnicode
==EXT
) {
1233 /* check if sourceChar is in 0xA1-0xEE range */
1234 if ((uint8_t) (EXT_RANGE_END
- sourceChar
) <= (EXT_RANGE_END
- EXT_RANGE_BEGIN
)) {
1235 /* We currently support only Anudatta and Devanagari abbreviation sign */
1236 if (sourceChar
==0xBF || sourceChar
== 0xB8) {
1237 targetUniChar
= (sourceChar
==0xBF) ? DEV_ABBR_SIGN
: DEV_ANUDATTA
;
1239 /* find out if the mapping is valid in this state */
1240 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1241 *contextCharToUnicode
= NO_CHAR_MARKER
;
1243 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1244 if (data
->prevToUnicodeStatus
) {
1245 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1246 data
->prevToUnicodeStatus
= 0x0000;
1248 /* write to target */
1249 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1254 /* byte unit is unassigned */
1255 targetUniChar
= missingCharMarker
;
1256 *err
= U_INVALID_CHAR_FOUND
;
1258 /* only 0xA1 - 0xEE are legal after EXT char */
1259 *contextCharToUnicode
= NO_CHAR_MARKER
;
1260 *err
= U_ILLEGAL_CHAR_FOUND
;
1263 } else if (*contextCharToUnicode
==ISCII_INV
) {
1264 if (sourceChar
==ISCII_HALANT
) {
1265 targetUniChar
= 0x0020; /* replace with space according to Indic FAQ */
1267 targetUniChar
= ZWJ
;
1270 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1271 if (data
->prevToUnicodeStatus
) {
1272 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1273 data
->prevToUnicodeStatus
= 0x0000;
1275 /* write to target */
1276 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1278 *contextCharToUnicode
=NO_CHAR_MARKER
;
1281 /* look at the pre-context and perform special processing */
1282 switch (sourceChar
) {
1284 case EXT
: /*falls through*/
1286 *contextCharToUnicode
= (UChar
)sourceChar
;
1288 if (*toUnicodeStatus
!= missingCharMarker
) {
1289 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1290 if (data
->prevToUnicodeStatus
) {
1291 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1292 data
->prevToUnicodeStatus
= 0x0000;
1294 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1295 *toUnicodeStatus
= missingCharMarker
;
1299 /* handle double danda*/
1300 if (*contextCharToUnicode
== ISCII_DANDA
) {
1301 targetUniChar
= DOUBLE_DANDA
;
1302 /* clear the context */
1303 *contextCharToUnicode
= NO_CHAR_MARKER
;
1304 *toUnicodeStatus
= missingCharMarker
;
1306 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1307 *contextCharToUnicode
= sourceChar
;
1311 /* handle explicit halant */
1312 if (*contextCharToUnicode
== ISCII_HALANT
) {
1313 targetUniChar
= ZWNJ
;
1314 /* clear the context */
1315 *contextCharToUnicode
= NO_CHAR_MARKER
;
1317 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1318 *contextCharToUnicode
= sourceChar
;
1324 data
->resetToDefaultToUnicode
= TRUE
;
1325 GET_MAPPING(sourceChar
,targetUniChar
,data
)
1327 *contextCharToUnicode
= sourceChar
;
1330 case ISCII_VOWEL_SIGN_E
:
1333 for (; i
<vowelSignESpecialCases
[0][0]; i
++) {
1334 if (vowelSignESpecialCases
[i
][0]==(uint8_t)*contextCharToUnicode
) {
1335 targetUniChar
=vowelSignESpecialCases
[i
][1];
1341 /* find out if the mapping is valid in this state */
1342 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1343 /*targetUniChar += data->currentDeltaToUnicode ;*/
1344 *contextCharToUnicode
= NO_CHAR_MARKER
;
1345 *toUnicodeStatus
= missingCharMarker
;
1349 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1350 *contextCharToUnicode
= sourceChar
;
1354 /* handle soft halant */
1355 if (*contextCharToUnicode
== ISCII_HALANT
) {
1356 targetUniChar
= ZWJ
;
1357 /* clear the context */
1358 *contextCharToUnicode
= NO_CHAR_MARKER
;
1360 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->contextCharToUnicode
== 0xc0) {
1361 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1362 if (data
->prevToUnicodeStatus
) {
1363 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1364 data
->prevToUnicodeStatus
= 0x0000;
1366 /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
1367 * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
1369 targetUniChar
= PNJ_RRA
;
1370 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1371 if (U_SUCCESS(*err
)) {
1372 targetUniChar
= PNJ_SIGN_VIRAMA
;
1373 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1374 if (U_SUCCESS(*err
)) {
1375 targetUniChar
= PNJ_HA
;
1376 WRITE_TO_TARGET_TO_U(args
, source
, target
, args
->offsets
, (source
-args
->source
)-2, targetUniChar
, 0, err
);
1378 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1381 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_SIGN_VIRAMA
;
1382 args
->converter
->UCharErrorBuffer
[args
->converter
->UCharErrorBufferLength
++]= PNJ_HA
;
1384 *toUnicodeStatus
= missingCharMarker
;
1385 data
->contextCharToUnicode
= NO_CHAR_MARKER
;
1388 /* try to handle <CHAR> + ISCII_NUKTA special mappings */
1391 for (; i
<nuktaSpecialCases
[0][0]; i
++) {
1392 if (nuktaSpecialCases
[i
][0]==(uint8_t)
1393 *contextCharToUnicode
) {
1394 targetUniChar
=nuktaSpecialCases
[i
][1];
1400 /* find out if the mapping is valid in this state */
1401 if (validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
) {
1402 /*targetUniChar += data->currentDeltaToUnicode ;*/
1403 *contextCharToUnicode
= NO_CHAR_MARKER
;
1404 *toUnicodeStatus
= missingCharMarker
;
1405 if (data
->currentDeltaToUnicode
== PNJ_DELTA
) {
1406 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1407 if (data
->prevToUnicodeStatus
) {
1408 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1409 data
->prevToUnicodeStatus
= 0x0000;
1411 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),targetUniChar
,data
->currentDeltaToUnicode
,err
);
1416 /* else fall through to default */
1418 /* else fall through to default */
1420 default:GET_MAPPING(sourceChar
,targetUniChar
,data
)
1422 *contextCharToUnicode
= sourceChar
;
1426 if (*toUnicodeStatus
!= missingCharMarker
) {
1427 /* Check to make sure that consonant clusters are handled correctly for Gurmukhi script. */
1428 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& data
->prevToUnicodeStatus
!= 0 && uset_contains(PNJ_CONSONANT_SET
, data
->prevToUnicodeStatus
) &&
1429 (*toUnicodeStatus
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& (targetUniChar
+ PNJ_DELTA
) == data
->prevToUnicodeStatus
) {
1430 /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
1431 offset
= (int)(source
-args
->source
- 3);
1432 tempTargetUniChar
= PNJ_ADHAK
; /* This is necessary to avoid some compiler warnings. */
1433 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,tempTargetUniChar
,0,err
);
1434 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,offset
,data
->prevToUnicodeStatus
,0,err
);
1435 data
->prevToUnicodeStatus
= 0x0000; /* reset the previous unicode code point */
1436 *toUnicodeStatus
= missingCharMarker
;
1439 /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
1440 if (data
->prevToUnicodeStatus
) {
1441 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-1),data
->prevToUnicodeStatus
,0,err
);
1442 data
->prevToUnicodeStatus
= 0x0000;
1444 /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
1445 * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
1447 if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_BINDI
&& uset_contains(PNJ_BINDI_TIPPI_SET
, (*toUnicodeStatus
+ PNJ_DELTA
))) {
1448 targetUniChar
= PNJ_TIPPI
- PNJ_DELTA
;
1449 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,PNJ_DELTA
,err
);
1450 } else if (data
->currentDeltaToUnicode
== PNJ_DELTA
&& (targetUniChar
+ PNJ_DELTA
) == PNJ_SIGN_VIRAMA
&& uset_contains(PNJ_CONSONANT_SET
, (*toUnicodeStatus
+ PNJ_DELTA
))) {
1451 /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
1452 data
->prevToUnicodeStatus
= *toUnicodeStatus
+ PNJ_DELTA
;
1454 /* write the previously mapped codepoint */
1455 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1458 *toUnicodeStatus
= missingCharMarker
;
1461 if (targetUniChar
!= missingCharMarker
) {
1462 /* now save the targetUniChar for delayed write */
1463 *toUnicodeStatus
= (UChar
) targetUniChar
;
1464 if (data
->resetToDefaultToUnicode
==TRUE
) {
1465 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1466 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1467 data
->resetToDefaultToUnicode
=FALSE
;
1471 /* we reach here only if targetUniChar == missingCharMarker
1472 * so assign codes to reason and err
1474 *err
= U_INVALID_CHAR_FOUND
;
1476 args
->converter
->toUBytes
[0] = (uint8_t) sourceChar
;
1477 args
->converter
->toULength
= 1;
1482 *err
=U_BUFFER_OVERFLOW_ERROR
;
1487 if (U_SUCCESS(*err
) && args
->flush
&& source
== sourceLimit
) {
1488 /* end of the input stream */
1489 UConverter
*cnv
= args
->converter
;
1491 if (*contextCharToUnicode
==ATR
|| *contextCharToUnicode
==EXT
|| *contextCharToUnicode
==ISCII_INV
) {
1492 /* set toUBytes[] */
1493 cnv
->toUBytes
[0] = (uint8_t)*contextCharToUnicode
;
1496 /* avoid looping on truncated sequences */
1497 *contextCharToUnicode
= NO_CHAR_MARKER
;
1502 if (*toUnicodeStatus
!= missingCharMarker
) {
1503 /* output a remaining target character */
1504 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
- args
->source
-1),*toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1505 *toUnicodeStatus
= missingCharMarker
;
/* hand the advanced read/write cursors back to the caller */
1509 args
->target
= target
;
1510 args
->source
= source
;
1513 /* Layout of the caller-supplied buffer filled by _ISCII_SafeClone(): that
 * function uses sizeof(struct cloneISCIIStruct) as the required buffer size
 * and copies the source converter's extraInfo into mydata. */
1514 struct cloneISCIIStruct
{
1516 UConverterDataISCII mydata
;
/*
 * Clones the ISCII-specific converter state into caller-provided storage.
 *
 * *status is checked first.  If *pBufferSize is 0 the call is a preflight:
 * the required size, sizeof(struct cloneISCIIStruct), is stored in
 * *pBufferSize.  Otherwise stackBuffer is treated as a struct
 * cloneISCIIStruct, cnv->extraInfo is copied into its mydata member, the
 * clone's extraInfo is pointed at that copy, isExtraLocal is set to TRUE and
 * the address of the embedded converter is returned.
 */
1520 _ISCII_SafeClone(const UConverter
*cnv
,
1522 int32_t *pBufferSize
,
1525 struct cloneISCIIStruct
* localClone
;
1526 int32_t bufferSizeNeeded
= sizeof(struct cloneISCIIStruct
);
1528 if (U_FAILURE(*status
)) {
1532 if (*pBufferSize
== 0) { /* 'preflighting' request - set needed size into *pBufferSize */
1533 *pBufferSize
= bufferSizeNeeded
;
1537 localClone
= (struct cloneISCIIStruct
*)stackBuffer
;
1538 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
1540 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataISCII
));
1541 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
1542 localClone
->cnv
.isExtraLocal
= TRUE
;
1544 return &localClone
->cnv
;
/*
 * Adds to sa->set the code points this converter handles:
 *   - the range 0..ASCII_END;
 *   - for every script from DEVANAGARI through MALAYALAM, each offset
 *     idx < DELTA whose validityTable entry matches that script's mask
 *     (plus offset 0x31 for TELUGU), rebased to
 *     INDIC_BLOCK_BEGIN + script * DELTA;
 *   - DANDA, DOUBLE_DANDA, ZWNJ and ZWJ.
 * cnv, which and pErrorCode are not referenced in the lines visible here.
 */
1548 _ISCIIGetUnicodeSet(const UConverter
*cnv
,
1549 const USetAdder
*sa
,
1550 UConverterUnicodeSet which
,
1551 UErrorCode
*pErrorCode
)
1553 int32_t idx
, script
;
1556 /* Since all ISCII versions allow switching to other ISCII
1557 scripts, we add all roundtrippable characters to this set. */
1558 sa
->addRange(sa
->set
, 0, ASCII_END
);
1559 for (script
= DEVANAGARI
; script
<= MALAYALAM
; script
++) {
1560 mask
= (uint8_t)(lookupInitialData
[script
].maskEnum
);
1561 for (idx
= 0; idx
< DELTA
; idx
++) {
1562 /* offset 0x31 is added for TELUGU even when the mask test fails */
1563 if ((validityTable
[idx
] & mask
) || (script
==TELUGU
&& idx
==0x31)) {
1564 sa
->add(sa
->set
, idx
+ (script
* DELTA
) + INDIC_BLOCK_BEGIN
);
1568 sa
->add(sa
->set
, DANDA
);
1569 sa
->add(sa
->set
, DOUBLE_DANDA
);
1570 sa
->add(sa
->set
, ZWNJ
);
1571 sa
->add(sa
->set
, ZWJ
);
/*
 * Function table for the ISCII converter.  Of the entries visible here, the
 * same to-Unicode routine is listed twice and the same from-Unicode routine
 * is listed twice -- presumably the plain and the with-offsets slots of
 * UConverterImpl (TODO: confirm against the struct definition).
 */
1574 static const UConverterImpl _ISCIIImpl
={
1585 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1586 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1587 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
1588 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
/* Static descriptor for the ISCII converter; the first visible initializer
 * is the size of the structure itself. */
1598 static const UConverterStaticData _ISCIIStaticData
={
1599 sizeof(UConverterStaticData
),
1612 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
/* Shared-data record for the ISCII converter (non-static, so visible to
 * other translation units); the first visible initializer is the size of
 * the structure itself. */
1616 const UConverterSharedData _ISCIIData
={
1617 sizeof(UConverterSharedData
),
1627 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */