2 **********************************************************************
3 * Copyright (C) 2000-2008, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: ucnvisci.c
8 * tab size: 8 (not used)
11 * created on: 2001JUN26
12 * created by: Ram Viswanadha
14 * Date Name Description
15 * 24/7/2001 Ram Added support for EXT character handling
18 #include "unicode/utypes.h"
20 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
24 #include "unicode/ucnv.h"
26 #include "unicode/ucnv_cb.h"
27 #include "unicode/uset.h"
30 #define UCNV_OPTIONS_VERSION_MASK 0xf
33 #define ZWNJ 0x200c /* Zero Width Non Joiner */
34 #define ZWJ 0x200d /* Zero width Joiner */
35 #define INVALID_CHAR 0xffff
36 #define ATR 0xEF /* Attribute code */
37 #define EXT 0xF0 /* Extension code */
39 #define DOUBLE_DANDA 0x0965
40 #define ISCII_NUKTA 0xE9
41 #define ISCII_HALANT 0xE8
42 #define ISCII_DANDA 0xEA
43 #define ISCII_INV 0xD9
44 #define ISCII_VOWEL_SIGN_E 0xE0
45 #define INDIC_BLOCK_BEGIN 0x0900
46 #define INDIC_BLOCK_END 0x0D7F
47 #define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
48 #define VOCALLIC_RR 0x0931
50 #define ASCII_END 0xA0
51 #define NO_CHAR_MARKER 0xFFFE
52 #define TELUGU_DELTA DELTA * TELUGU
53 #define DEV_ABBR_SIGN 0x0970
54 #define DEV_ANUDATTA 0x0952
55 #define EXT_RANGE_BEGIN 0xA1
56 #define EXT_RANGE_END 0xEE
74 * Enumeration for switching code pages if <ATR>+<one of below values>
110 #define ISCII_CNV_PREFIX "ISCII,version="
/* Per-converter ISCII state; _ISCIIOpen allocates it and stores it in cnv->extraInfo. */
113 UChar contextCharToUnicode
; /* previous ISCII byte seen by toUnicode, kept for contextual analysis (NO_CHAR_MARKER when none) */
114 UChar contextCharFromUnicode
; /* fromUnicode context: set to ISCII_HALANT after a halant is mapped, reset to 0 elsewhere */
115 uint16_t defDeltaToUnicode
; /* delta for switching to default state when DEF is encountered */
116 uint16_t currentDeltaFromUnicode
;/* current delta in Indic block (fromUnicode direction) */
117 uint16_t currentDeltaToUnicode
; /* current delta in Indic block (toUnicode direction) */
118 MaskEnum currentMaskFromUnicode
; /* mask for current state in fromUnicode */
119 MaskEnum currentMaskToUnicode
; /* mask for current state in toUnicode */
120 MaskEnum defMaskToUnicode
; /* mask for default state in toUnicode */
121 UBool isFirstBuffer
; /* boolean for fromUnicode to see if we need to announce the first script */
122 UBool resetToDefaultToUnicode
; /* boolean for resetting to default delta and mask when a newline is encountered */
123 char name
[sizeof(ISCII_CNV_PREFIX
) + 1]; /* ISCII_CNV_PREFIX followed by a single version digit and the terminating NUL */
124 }UConverterDataISCII
;
126 typedef struct LookupDataStruct
/*
 * Initial state for each supported converter version: _ISCIIOpen indexes this
 * table with (options & UCNV_OPTIONS_VERSION_MASK) and reads .uniLang and
 * .maskEnum; the fromUnicode code reads .isciiLang when it emits ATR + script.
 * Telugu shares KND_MASK with Kannada (one validity bit covers both scripts).
 * NOTE(review): the struct's field order is not visible in this listing;
 * each row appears to be { uniLang, maskEnum, isciiLang } - confirm.
 */
133 static const LookupDataStruct lookupInitialData
[]={
134 { DEVANAGARI
, DEV_MASK
, DEV
},
135 { BENGALI
, BNG_MASK
, BNG
},
136 { GURMUKHI
, PNJ_MASK
, PNJ
},
137 { GUJARATI
, GJR_MASK
, GJR
},
138 { ORIYA
, ORI_MASK
, ORI
},
139 { TAMIL
, TML_MASK
, TML
},
140 { TELUGU
, KND_MASK
, TLG
},
141 { KANNADA
, KND_MASK
, KND
},
142 { MALAYALAM
, MLM_MASK
, MLM
}
/*
 * Opens an ISCII converter: allocates the UConverterDataISCII state into
 * cnv->extraInfo and seeds the deltas and masks from lookupInitialData for the
 * version held in the low 4 bits of options (UCNV_OPTIONS_VERSION_MASK).
 * For an unsupported version (>= 9) the state is released and
 * U_ILLEGAL_ARGUMENT_ERROR is reported; U_MEMORY_ALLOCATION_ERROR is the
 * status assigned for a failed allocation.
 * name and locale are not referenced in the visible body.
 * NOTE(review): the else/brace lines of this function are missing from this
 * listing; the branch pairing described above is inferred from statement order.
 */
146 _ISCIIOpen(UConverter
*cnv
, const char *name
,const char *locale
,uint32_t options
, UErrorCode
*errorCode
){
147 cnv
->extraInfo
= uprv_malloc (sizeof (UConverterDataISCII
));
149 if(cnv
->extraInfo
!= NULL
) {
151 UConverterDataISCII
*converterData
=(UConverterDataISCII
*) cnv
->extraInfo
;
/* start with no pending context in either direction */
152 converterData
->contextCharToUnicode
=NO_CHAR_MARKER
;
153 cnv
->toUnicodeStatus
= missingCharMarker
;
154 converterData
->contextCharFromUnicode
=0x0000;
155 converterData
->resetToDefaultToUnicode
=FALSE
;
156 /* check if the version requested is supported (lookupInitialData has 9 rows) */
157 if((options
& UCNV_OPTIONS_VERSION_MASK
) < 9){
158 /* initialize state variables: current and default deltas all start at the version's script block */
159 converterData
->currentDeltaFromUnicode
=converterData
->currentDeltaToUnicode
=
160 converterData
->defDeltaToUnicode
=
161 (uint16_t)(lookupInitialData
[options
& UCNV_OPTIONS_VERSION_MASK
].uniLang
* DELTA
);
/* current and default validity masks likewise come from the version's row */
163 converterData
->currentMaskFromUnicode
= converterData
->currentMaskToUnicode
=
164 converterData
->defMaskToUnicode
=lookupInitialData
[options
& UCNV_OPTIONS_VERSION_MASK
].maskEnum
;
166 converterData
->isFirstBuffer
=TRUE
;
/* build the converter name: ISCII_CNV_PREFIX followed by the version digit */
167 (void)uprv_strcpy(converterData
->name
, ISCII_CNV_PREFIX
);
168 len
= (int32_t)uprv_strlen(converterData
->name
);
169 converterData
->name
[len
]= (char)((options
& UCNV_OPTIONS_VERSION_MASK
) + '0');
170 converterData
->name
[len
+1]=0;
/* unsupported version: release the state and report an argument error */
172 uprv_free(cnv
->extraInfo
);
173 cnv
->extraInfo
= NULL
;
174 *errorCode
= U_ILLEGAL_ARGUMENT_ERROR
;
/* allocation failure */
178 *errorCode
=U_MEMORY_ALLOCATION_ERROR
;
/*
 * Closes the converter: when extraInfo is set and isExtraLocal is false,
 * the state is released with uprv_free.
 * NOTE(review): the remainder of the function is not visible in this listing.
 */
182 _ISCIIClose(UConverter
*cnv
){
183 if(cnv
->extraInfo
!=NULL
) {
184 if(!cnv
->isExtraLocal
) {
185 uprv_free(cnv
->extraInfo
);
/*
 * Name getter for the ISCII converter. Only the cast of cnv->extraInfo to
 * UConverterDataISCII is visible here; presumably the function returns
 * myData->name (built in _ISCIIOpen) - confirm against the full file.
 */
192 _ISCIIgetName(const UConverter
* cnv
){
194 UConverterDataISCII
* myData
= (UConverterDataISCII
*)cnv
->extraInfo
;
/*
 * Resets converter state according to choice:
 *  - choice <= UCNV_RESET_TO_UNICODE resets the toUnicode side
 *    (status, delta, mask and context byte go back to their defaults);
 *  - choice != UCNV_RESET_TO_UNICODE resets the fromUnicode side.
 * The struct has no separate fromUnicode defaults, so the fromUnicode mask and
 * delta are restored from defMaskToUnicode / defDeltaToUnicode.
 */
201 _ISCIIReset(UConverter
*cnv
, UConverterResetChoice choice
){
202 UConverterDataISCII
* data
=(UConverterDataISCII
*) (cnv
->extraInfo
);
/* toUnicode direction */
203 if(choice
<=UCNV_RESET_TO_UNICODE
) {
204 cnv
->toUnicodeStatus
= missingCharMarker
;
206 data
->currentDeltaToUnicode
=data
->defDeltaToUnicode
;
207 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
208 data
->contextCharToUnicode
=NO_CHAR_MARKER
;
/* fromUnicode direction */
210 if(choice
!=UCNV_RESET_TO_UNICODE
) {
211 cnv
->fromUChar32
=0x0000;
212 data
->contextCharFromUnicode
=0x00;
213 data
->currentMaskFromUnicode
=data
->defMaskToUnicode
;
214 data
->currentDeltaFromUnicode
=data
->defDeltaToUnicode
;
215 data
->isFirstBuffer
=TRUE
;
/* NOTE(review): this toUnicode flag is cleared in the fromUnicode branch - confirm that is intended */
216 data
->resetToDefaultToUnicode
=FALSE
;
221 * The values in the validity table are indexed by the lower bits of Unicode
222 * range 0x0900 - 0x097f. Each value has the following bit structure:
223 * ---------------------------------------------------------------
224 * | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
225 * |     |     |     |     | ASM | KND |     |     |
226 * ---------------------------------------------------------------
227 * If a code point is valid in a particular script
228 * then that bit is turned on
230 * Unicode does not distinguish between Bengali and Assamese, so we use 1 bit
231 * to represent both languages.
233 * Telugu and Kannada have the same code points except for Vocallic_RR, which we special-case,
234 * so we combine them and use 1 bit to represent both scripts.
236 * TODO: It is probably easier to understand and maintain to change this
237 * to use uint16_t and give each of the 9 Unicode/script blocks its own bit.
240 static const uint8_t validityTable
[128] = {
241 /* This state table is tool generated please do not edit unless you know exactly what you are doing */
242 /* Note: This table was edited to mirror the Windows XP implementation */
243 /*ISCII:Valid:Unicode */
244 /*0xa0 : 0x00: 0x900 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
245 /*0xa1 : 0xb8: 0x901 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
246 /*0xa2 : 0xfe: 0x902 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
247 /*0xa3 : 0xbf: 0x903 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
248 /*0x00 : 0x00: 0x904 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
249 /*0xa4 : 0xff: 0x905 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
250 /*0xa5 : 0xff: 0x906 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
251 /*0xa6 : 0xff: 0x907 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
252 /*0xa7 : 0xff: 0x908 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
253 /*0xa8 : 0xff: 0x909 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
254 /*0xa9 : 0xff: 0x90a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
255 /*0xaa : 0xfe: 0x90b */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
256 /*0x00 : 0x00: 0x90c */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
257 /*0xae : 0x80: 0x90d */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
258 /*0xab : 0x87: 0x90e */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
259 /*0xac : 0xff: 0x90f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
260 /*0xad : 0xff: 0x910 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
261 /*0xb2 : 0x80: 0x911 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
262 /*0xaf : 0x87: 0x912 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
263 /*0xb0 : 0xff: 0x913 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
264 /*0xb1 : 0xff: 0x914 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
265 /*0xb3 : 0xff: 0x915 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
266 /*0xb4 : 0xfe: 0x916 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
267 /*0xb5 : 0xfe: 0x917 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
268 /*0xb6 : 0xfe: 0x918 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
269 /*0xb7 : 0xff: 0x919 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
270 /*0xb8 : 0xff: 0x91a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
271 /*0xb9 : 0xfe: 0x91b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
272 /*0xba : 0xff: 0x91c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
273 /*0xbb : 0xfe: 0x91d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
274 /*0xbc : 0xff: 0x91e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
275 /*0xbd : 0xff: 0x91f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
276 /*0xbe : 0xfe: 0x920 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
277 /*0xbf : 0xfe: 0x921 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
278 /*0xc0 : 0xfe: 0x922 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
279 /*0xc1 : 0xff: 0x923 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
280 /*0xc2 : 0xff: 0x924 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
281 /*0xc3 : 0xfe: 0x925 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
282 /*0xc4 : 0xfe: 0x926 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
283 /*0xc5 : 0xfe: 0x927 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
284 /*0xc6 : 0xff: 0x928 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
285 /*0xc7 : 0x81: 0x929 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ TML_MASK
,
286 /*0xc8 : 0xff: 0x92a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
287 /*0xc9 : 0xfe: 0x92b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
288 /*0xca : 0xfe: 0x92c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
289 /*0xcb : 0xfe: 0x92d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
290 /*0xcc : 0xfe: 0x92e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
291 /*0xcd : 0xff: 0x92f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
292 /*0xcf : 0xff: 0x930 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
293 /*0xd0 : 0x87: 0x931 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
294 /*0xd1 : 0xff: 0x932 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
295 /*0xd2 : 0xb7: 0x933 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
296 /*0xd3 : 0x83: 0x934 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ MLM_MASK
+ TML_MASK
,
297 /*0xd4 : 0xff: 0x935 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
298 /*0xd5 : 0xfe: 0x936 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
299 /*0xd6 : 0xbf: 0x937 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
300 /*0xd7 : 0xff: 0x938 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
301 /*0xd8 : 0xff: 0x939 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
302 /*0x00 : 0x00: 0x93A */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
303 /*0x00 : 0x00: 0x93B */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
304 /*0xe9 : 0xda: 0x93c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
305 /*0x00 : 0x00: 0x93d */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
306 /*0xda : 0xff: 0x93e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
307 /*0xdb : 0xff: 0x93f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
308 /*0xdc : 0xff: 0x940 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
309 /*0xdd : 0xff: 0x941 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
310 /*0xde : 0xff: 0x942 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
311 /*0xdf : 0xbe: 0x943 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
312 /*0x00 : 0x00: 0x944 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ BNG_MASK
+ KND_MASK
+ ZERO
+ ZERO
,
313 /*0xe3 : 0x80: 0x945 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
314 /*0xe0 : 0x87: 0x946 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
315 /*0xe1 : 0xff: 0x947 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
316 /*0xe2 : 0xff: 0x948 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
317 /*0xe7 : 0x80: 0x949 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
318 /*0xe4 : 0x87: 0x94a */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
319 /*0xe5 : 0xff: 0x94b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
320 /*0xe6 : 0xff: 0x94c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
321 /*0xe8 : 0xff: 0x94d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
322 /*0xec : 0x00: 0x94e */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
323 /*0xed : 0x00: 0x94f */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
324 /*0x00 : 0x00: 0x950 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
325 /*0x00 : 0x00: 0x951 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
326 /*0x00 : 0x00: 0x952 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
327 /*0x00 : 0x00: 0x953 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
328 /*0x00 : 0x00: 0x954 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
329 /*0x00 : 0x00: 0x955 */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
330 /*0x00 : 0x00: 0x956 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ ZERO
+ KND_MASK
+ ZERO
+ ZERO
,
331 /*0x00 : 0x00: 0x957 */ ZERO
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ MLM_MASK
+ ZERO
,
332 /*0x00 : 0x00: 0x958 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
333 /*0x00 : 0x00: 0x959 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
334 /*0x00 : 0x00: 0x95a */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
335 /*0x00 : 0x00: 0x95b */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
336 /*0x00 : 0x00: 0x95c */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
337 /*0x00 : 0x00: 0x95d */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
338 /*0x00 : 0x00: 0x95e */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
339 /*0xce : 0x98: 0x95f */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
340 /*0x00 : 0x00: 0x960 */ DEV_MASK
+ ZERO
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
341 /*0x00 : 0x00: 0x961 */ DEV_MASK
+ ZERO
+ ZERO
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ ZERO
,
342 /*0x00 : 0x00: 0x962 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
343 /*0x00 : 0x00: 0x963 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ BNG_MASK
+ ZERO
+ ZERO
+ ZERO
,
344 /*0xea : 0xf8: 0x964 */ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
345 /*0xeaea : 0x00: 0x965*/ DEV_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
346 /*0xf1 : 0xff: 0x966 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
347 /*0xf2 : 0xff: 0x967 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
348 /*0xf3 : 0xff: 0x968 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
349 /*0xf4 : 0xff: 0x969 */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
350 /*0xf5 : 0xff: 0x96a */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
351 /*0xf6 : 0xff: 0x96b */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
352 /*0xf7 : 0xff: 0x96c */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
353 /*0xf8 : 0xff: 0x96d */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
354 /*0xf9 : 0xff: 0x96e */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
355 /*0xfa : 0xff: 0x96f */ DEV_MASK
+ PNJ_MASK
+ GJR_MASK
+ ORI_MASK
+ BNG_MASK
+ KND_MASK
+ MLM_MASK
+ TML_MASK
,
356 /*0x00 : 0x80: 0x970 */ DEV_MASK
+ PNJ_MASK
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
,
359 * The length of the array is 128 to provide values for 0x900..0x97f.
360 * The last 15 entries for 0x971..0x97f of the validity table are all zero
361 * because no Indic script uses such Unicode code points.
363 /*0x00 : 0x00: 0x9yz */ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
+ ZERO
366 static const uint16_t fromUnicodeTable
[128]={
447 0xA1E9 ,/* 0x0950 */ /* OM Symbol */
496 static const uint16_t toUnicodeTable
[256]={
755 static const uint16_t vowelSignESpecialCases
[][2]={
756 { 2 /*length of array*/ , 0 },
760 static const uint16_t nuktaSpecialCases
[][2]={
761 { 16 /*length of array*/ , 0 },
779 #define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
780 /* write the targetUniChar to target */ \
781 if(target <targetLimit){ \
782 if(targetByteUnit <= 0xFF){ \
783 *(target)++ = (uint8_t)(targetByteUnit); \
785 *(offsets++) = (int32_t)(source - args->source-1); \
788 *(target)++ = (uint8_t)(targetByteUnit>>8); \
790 *(offsets++) = (int32_t)(source - args->source-1); \
792 if(target < targetLimit){ \
793 *(target)++ = (uint8_t) targetByteUnit; \
795 *(offsets++) = (int32_t)(source - args->source-1); \
798 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
799 (uint8_t) (targetByteUnit); \
800 *err = U_BUFFER_OVERFLOW_ERROR; \
804 if(targetByteUnit & 0xFF00){ \
805 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
806 (uint8_t) (targetByteUnit >>8); \
808 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
809 (uint8_t) (targetByteUnit); \
810 *err = U_BUFFER_OVERFLOW_ERROR; \
/*
 * Converts UTF-16 from args->source into ISCII bytes at args->target.
 * Visible flow: code units <= ASCII_END are written through unchanged (an LF
 * is followed by ATR + the current script code); a code point inside the
 * Indic block is rebased by the delta of its script block, looked up in
 * fromUnicodeTable and checked against validityTable; when the script block
 * changes, ATR + the new script code is written first; unmappable input sets
 * *err and is stored in converter->fromUChar32.
 * Invalid arguments (NULL converter, limit before start) set
 * U_ILLEGAL_ARGUMENT_ERROR.
 * NOTE(review): several lines of this function (declarations, else/brace
 * lines, case labels) are not visible in this listing, so the exact branch
 * structure should be confirmed against the full file.
 */
822 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs
* args
,
824 const UChar
*source
= args
->source
;
825 const UChar
*sourceLimit
= args
->sourceLimit
;
826 unsigned char *target
= (unsigned char *) args
->target
;
827 unsigned char *targetLimit
= (unsigned char *) args
->targetLimit
;
828 int32_t* offsets
= args
->offsets
;
829 uint32_t targetByteUnit
= 0x0000;
830 UChar32 sourceChar
= 0x0000;
831 UConverterDataISCII
*converterData
;
834 UBool deltaChanged
= FALSE
;
836 if ((args
->converter
== NULL
) || (args
->targetLimit
< args
->target
) || (args
->sourceLimit
< args
->source
)){
837 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
840 /* initialize data */
841 converterData
=(UConverterDataISCII
*)args
->converter
->extraInfo
;
842 newDelta
=converterData
->currentDeltaFromUnicode
;
843 range
= (uint16_t)(newDelta
/DELTA
);
845 if((sourceChar
= args
->converter
->fromUChar32
)!=0) {
849 /*writing the char to the output stream */
850 while(source
< sourceLimit
){
852 targetByteUnit
= missingCharMarker
;
854 sourceChar
= *source
++;
856 /*check if input is in the pass-through range: everything up to and including ASCII_END (0xA0)*/
857 if (sourceChar
<= ASCII_END
) {
858 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,sourceChar
,err
);
862 if(sourceChar
== LF
){
863 targetByteUnit
= ATR
<<8;
864 targetByteUnit
+= (uint8_t) lookupInitialData
[range
].isciiLang
;
865 args
->converter
->fromUnicodeStatus
=sourceChar
;
866 /* now append ATR and language code */
867 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
876 /* contextChar has HALANT */
877 if(converterData
->contextCharFromUnicode
){
878 converterData
->contextCharFromUnicode
= 0x00;
879 targetByteUnit
= ISCII_HALANT
;
881 /* consume ZWNJ and continue */
882 converterData
->contextCharFromUnicode
= 0x00;
887 /* contextChar has HALANT */
888 if(converterData
->contextCharFromUnicode
){
889 targetByteUnit
= ISCII_NUKTA
;
891 targetByteUnit
=ISCII_INV
;
893 converterData
->contextCharFromUnicode
= 0x00;
896 /* is the sourceChar in the INDIC_RANGE? */
897 if((uint16_t)(INDIC_BLOCK_END
-sourceChar
) <= INDIC_RANGE
){
898 /* Danda and Double Danda are valid in Northern scripts.. since Unicode
899 * does not include these codepoints in all Northern scripts we need to
902 if(sourceChar
!= DANDA
&& sourceChar
!= DOUBLE_DANDA
){
903 /* find out to which block the sourceChar belongs*/
904 range
=(uint16_t)((sourceChar
-INDIC_BLOCK_BEGIN
)/DELTA
);
905 newDelta
=(uint16_t)(range
*DELTA
);
907 /* Now are we in the same block as the previous? */
908 if(newDelta
!= converterData
->currentDeltaFromUnicode
|| converterData
->isFirstBuffer
){
909 converterData
->currentDeltaFromUnicode
= newDelta
;
910 converterData
->currentMaskFromUnicode
= lookupInitialData
[range
].maskEnum
;
912 converterData
->isFirstBuffer
=FALSE
;
914 /* Normalize all Indic codepoints to Devanagari and map them to ISCII */
915 /* now subtract the new delta from sourceChar*/
916 sourceChar
-= converterData
->currentDeltaFromUnicode
;
919 /* get the target byte unit */
920 targetByteUnit
=fromUnicodeTable
[(uint8_t)sourceChar
];
922 /* is the code point valid in current script? */
923 if((validityTable
[(uint8_t)sourceChar
] & converterData
->currentMaskFromUnicode
)==0){
924 /* Vocallic RR is assigned in ISCII Telugu and Unicode */
925 if(converterData
->currentDeltaFromUnicode
!=(TELUGU_DELTA
) || sourceChar
!=VOCALLIC_RR
){
926 targetByteUnit
=missingCharMarker
;
931 /* we are in a script block which is different than
932 * previous sourceChar's script block write ATR and language codes
935 temp
=(uint16_t)(ATR
<<8);
936 temp
+= (uint16_t)((uint8_t) lookupInitialData
[range
].isciiLang
);
939 /* now append ATR and language code */
940 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,temp
,err
);
946 /* reset context char */
947 converterData
->contextCharFromUnicode
= 0x00;
952 if(targetByteUnit
!= missingCharMarker
){
953 if(targetByteUnit
==ISCII_HALANT
){
954 converterData
->contextCharFromUnicode
= (UChar
)targetByteUnit
;
956 /* write targetByteUnit to target*/
957 WRITE_TO_TARGET_FROM_U(args
,offsets
,source
,target
,targetLimit
,targetByteUnit
,err
);
963 /* oops.. the code point is unassigned */
964 /*check if the char is a First surrogate*/
965 if(UTF_IS_SURROGATE(sourceChar
)) {
966 if(UTF_IS_SURROGATE_FIRST(sourceChar
)) {
968 /*look ahead to find the trail surrogate*/
969 if(source
< sourceLimit
) {
970 /* test the following code unit */
971 UChar trail
= (*source
);
972 if(UTF_IS_SECOND_SURROGATE(trail
)) {
974 sourceChar
=UTF16_GET_PAIR_VALUE(sourceChar
, trail
);
975 *err
=U_INVALID_CHAR_FOUND
;
976 /* convert this surrogate code point */
977 /* exit this condition tree */
979 /* this is an unmatched lead code unit (1st surrogate) */
980 /* callback(illegal) */
981 *err
=U_ILLEGAL_CHAR_FOUND
;
988 /* this is an unmatched trail code unit (2nd surrogate) */
989 /* callback(illegal) */
990 *err
=U_ILLEGAL_CHAR_FOUND
;
993 /* callback(unassigned) for a BMP code point */
994 *err
= U_INVALID_CHAR_FOUND
;
997 args
->converter
->fromUChar32
=sourceChar
;
1000 }/* end while(source < sourceLimit) */
1002 /*save the state and return */
1003 args
->source
= source
;
1004 args
->target
= (char*)target
;
/*
 * Script lookup used by the toUnicode code after an ATR byte: it is indexed
 * with (sourceChar & 0x0F); column 0 is multiplied by DELTA to form the
 * Unicode block delta and column 1 is cast to MaskEnum for the validity mask.
 * NOTE(review): row 6 repeats BENGALI/BNG_MASK - presumably the Assamese
 * script code, which shares the Bengali block; confirm against the
 * script-code enumeration.
 */
1007 static const uint16_t lookupTable
[][2]={
1008 { ZERO
, ZERO
}, /*DEFAULT*/
1009 { ZERO
, ZERO
}, /*ROMAN*/
1010 { DEVANAGARI
, DEV_MASK
},
1011 { BENGALI
, BNG_MASK
},
1012 { TAMIL
, TML_MASK
},
1013 { TELUGU
, KND_MASK
},
1014 { BENGALI
, BNG_MASK
},
1015 { ORIYA
, ORI_MASK
},
1016 { KANNADA
, KND_MASK
},
1017 { MALAYALAM
, MLM_MASK
},
1018 { GUJARATI
, GJR_MASK
},
1019 { GURMUKHI
, PNJ_MASK
}
1023 #define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
1024 /* add offset to current Indic Block */ \
1025 if(targetUniChar>ASCII_END && \
1026 targetUniChar != ZWJ && \
1027 targetUniChar != ZWNJ && \
1028 targetUniChar != DANDA && \
1029 targetUniChar != DOUBLE_DANDA){ \
1031 targetUniChar+=(uint16_t)(delta); \
1033 /* now write the targetUniChar */ \
1034 if(target<args->targetLimit){ \
1035 *(target)++ = (UChar)targetUniChar; \
1037 *(offsets)++ = (int32_t)(offset); \
1040 args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++] = \
1041 (UChar)targetUniChar; \
1042 *err = U_BUFFER_OVERFLOW_ERROR; \
1046 #define GET_MAPPING(sourceChar,targetUniChar,data){ \
1047 targetUniChar = toUnicodeTable[(sourceChar)] ; \
1048 /* is the code point valid in current script? */ \
1049 if(sourceChar> ASCII_END && \
1050 (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){ \
1051 /* Vocallic RR is assigne in ISCII Telugu and Unicode */ \
1052 if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
1053 targetUniChar!=VOCALLIC_RR){ \
1054 targetUniChar=missingCharMarker; \
1060 * Rules for ISCII to Unicode converter
1061 * ISCII is stateful encoding. To convert ISCII bytes to Unicode,
1062 * which has both precomposed and decomposed forms characters
1063 * pre-context and post-context need to be considered.
1066 * i) ATR : Attribute code is used to declare font and script switching.
1067 * Currently we only switch scripts; font codes are consumed without generating an error
1068 * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
1069 * obsolete characters
1071 * i) Halant: if preceded by a halant then it is an explicit halant
1073 * a) if preceded by a halant then it is a soft halant
1074 * b) if preceded by specific consonants and the ligatures have pre-composed
1075 * characters in Unicode then convert to pre-composed characters
1076 * iii) Danda: If Danda is preceded by a Danda then convert to Double Danda
1081 UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs
*args
,
1083 const char *source
= ( char *) args
->source
;
1084 UChar
*target
= args
->target
;
1085 const char *sourceLimit
= args
->sourceLimit
;
1086 const UChar
* targetLimit
= args
->targetLimit
;
1087 uint32_t targetUniChar
= 0x0000;
1088 uint8_t sourceChar
= 0x0000;
1089 UConverterDataISCII
* data
;
1090 UChar32
* toUnicodeStatus
=NULL
;
1091 UChar
* contextCharToUnicode
= NULL
;
1095 if ((args
->converter
== NULL
) || (target
< args
->target
) || (source
< args
->source
)){
1096 *err
= U_ILLEGAL_ARGUMENT_ERROR
;
1100 data
= (UConverterDataISCII
*)(args
->converter
->extraInfo
);
1101 contextCharToUnicode
= &data
->contextCharToUnicode
; /* contains previous ISCII codepoint visited */
1102 toUnicodeStatus
= (UChar32
*)&args
->converter
->toUnicodeStatus
;/* contains the mapping to Unicode of the above codepoint*/
1104 while(source
<sourceLimit
){
1106 targetUniChar
= missingCharMarker
;
1108 if(target
< targetLimit
){
1109 sourceChar
= (unsigned char)*(source
)++;
1111 /* look at the post-context preform special processing */
1112 if(*contextCharToUnicode
==ATR
){
1114 /* If we have ATR in *contextCharToUnicode then we need to change our
1115 * state to the Indic Script specified by sourceChar
1118 /* check if the sourceChar is supported script range*/
1119 if((uint8_t)(PNJ
-sourceChar
)<=PNJ
-DEV
){
1120 data
->currentDeltaToUnicode
=
1121 (uint16_t)(lookupTable
[sourceChar
& 0x0F][0] * DELTA
);
1122 data
->currentMaskToUnicode
=
1123 (MaskEnum
)lookupTable
[sourceChar
& 0x0F][1];
1125 else if(sourceChar
==DEF
){
1126 /* switch back to default */
1127 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1128 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1130 if((sourceChar
>= 0x21 && sourceChar
<= 0x3F)){
1131 /* these are display codes consume and continue */
1133 *err
=U_ILLEGAL_CHAR_FOUND
;
1135 *contextCharToUnicode
=NO_CHAR_MARKER
;
1141 *contextCharToUnicode
=NO_CHAR_MARKER
;
1145 }else if(*contextCharToUnicode
==EXT
){
1146 /* check if sourceChar is in 0xA1-0xEE range */
1147 if((uint8_t) (EXT_RANGE_END
- sourceChar
) <= (EXT_RANGE_END
- EXT_RANGE_BEGIN
)){
1148 /* We currently support only Anudatta and Devanagari abbreviation sign */
1149 if(sourceChar
==0xBF || sourceChar
== 0xB8){
1150 targetUniChar
= (sourceChar
==0xBF) ? DEV_ABBR_SIGN
: DEV_ANUDATTA
;
1152 /* find out if the mapping is valid in this state */
1153 if(validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
){
1155 *contextCharToUnicode
= NO_CHAR_MARKER
;
1157 /* write to target */
1158 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),
1159 targetUniChar
,data
->currentDeltaToUnicode
,err
);
1164 /* byte unit is unassigned */
1165 targetUniChar
= missingCharMarker
;
1166 *err
= U_INVALID_CHAR_FOUND
;
1168 /* only 0xA1 - 0xEE are legal after EXT char */
1169 *contextCharToUnicode
= NO_CHAR_MARKER
;
1170 *err
= U_ILLEGAL_CHAR_FOUND
;
1173 }else if(*contextCharToUnicode
==ISCII_INV
){
1174 if(sourceChar
==ISCII_HALANT
){
1175 targetUniChar
= 0x0020; /* replace with space accoding to Indic FAQ */
1177 targetUniChar
= ZWJ
;
1180 /* write to target */
1181 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),
1182 targetUniChar
,data
->currentDeltaToUnicode
,err
);
1184 *contextCharToUnicode
=NO_CHAR_MARKER
;
1187 /* look at the pre-context and perform special processing */
1190 case EXT
: /*falls through*/
1192 *contextCharToUnicode
= (UChar
)sourceChar
;
1194 if(*toUnicodeStatus
!= missingCharMarker
){
1196 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),
1197 *toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1198 *toUnicodeStatus
= missingCharMarker
;
1202 /* handle double danda*/
1203 if(*contextCharToUnicode
== ISCII_DANDA
){
1204 targetUniChar
= DOUBLE_DANDA
;
1205 /* clear the context */
1206 *contextCharToUnicode
= NO_CHAR_MARKER
;
1207 *toUnicodeStatus
= missingCharMarker
;
1209 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1210 *contextCharToUnicode
= sourceChar
;
1214 /* handle explicit halant */
1215 if(*contextCharToUnicode
== ISCII_HALANT
){
1216 targetUniChar
= ZWNJ
;
1217 /* clear the context */
1218 *contextCharToUnicode
= NO_CHAR_MARKER
;
1220 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1221 *contextCharToUnicode
= sourceChar
;
1227 data
->resetToDefaultToUnicode
= TRUE
;
1228 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1229 *contextCharToUnicode
= sourceChar
;
1232 case ISCII_VOWEL_SIGN_E
:
1235 for( ;i
<vowelSignESpecialCases
[0][0];i
++){
1236 if(vowelSignESpecialCases
[i
][0]==(uint8_t)*contextCharToUnicode
){
1237 targetUniChar
=vowelSignESpecialCases
[i
][1];
1243 /* find out if the mapping is valid in this state */
1244 if(validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
){
1245 /*targetUniChar += data->currentDeltaToUnicode ;*/
1246 *contextCharToUnicode
= NO_CHAR_MARKER
;
1247 *toUnicodeStatus
= missingCharMarker
;
1251 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1252 *contextCharToUnicode
= sourceChar
;
1256 /* handle soft halant */
1257 if(*contextCharToUnicode
== ISCII_HALANT
){
1258 targetUniChar
= ZWJ
;
1259 /* clear the context */
1260 *contextCharToUnicode
= NO_CHAR_MARKER
;
1263 /* try to handle <CHAR> + ISCII_NUKTA special mappings */
1266 for( ;i
<nuktaSpecialCases
[0][0];i
++){
1267 if(nuktaSpecialCases
[i
][0]==(uint8_t)*contextCharToUnicode
){
1268 targetUniChar
=nuktaSpecialCases
[i
][1];
1274 /* find out if the mapping is valid in this state */
1275 if(validityTable
[(uint8_t)targetUniChar
] & data
->currentMaskToUnicode
){
1276 /*targetUniChar += data->currentDeltaToUnicode ;*/
1277 *contextCharToUnicode
= NO_CHAR_MARKER
;
1278 *toUnicodeStatus
= missingCharMarker
;
1281 /* else fall through to default */
1283 /* else fall through to default */
1286 GET_MAPPING(sourceChar
,targetUniChar
,data
);
1287 *contextCharToUnicode
= sourceChar
;
1292 if(*toUnicodeStatus
!= missingCharMarker
){
1293 /* write the previously mapped codepoint */
1294 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
-args
->source
-2),
1295 *toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1296 *toUnicodeStatus
= missingCharMarker
;
1300 if(targetUniChar
!= missingCharMarker
){
1301 /* now save the targetUniChar for delayed write */
1302 *toUnicodeStatus
= (UChar
) targetUniChar
;
1303 if(data
->resetToDefaultToUnicode
==TRUE
){
1304 data
->currentDeltaToUnicode
= data
->defDeltaToUnicode
;
1305 data
->currentMaskToUnicode
= data
->defMaskToUnicode
;
1306 data
->resetToDefaultToUnicode
=FALSE
;
1310 /* we reach here only if targetUniChar == missingCharMarker
1311 * so assign codes to reason and err
1313 *err
= U_INVALID_CHAR_FOUND
;
1315 args
->converter
->toUBytes
[0] = (uint8_t) sourceChar
;
1316 args
->converter
->toULength
= 1;
1322 *err
=U_BUFFER_OVERFLOW_ERROR
;
1327 if(U_SUCCESS(*err
) && args
->flush
&& source
== sourceLimit
) {
1328 /* end of the input stream */
1329 UConverter
*cnv
= args
->converter
;
1331 if(*contextCharToUnicode
==ATR
|| *contextCharToUnicode
==EXT
|| *contextCharToUnicode
==ISCII_INV
){
1332 /* set toUBytes[] */
1333 cnv
->toUBytes
[0] = (uint8_t)*contextCharToUnicode
;
1336 /* avoid looping on truncated sequences */
1337 *contextCharToUnicode
= NO_CHAR_MARKER
;
1342 if(*toUnicodeStatus
!= missingCharMarker
) {
1343 /* output a remaining target character */
1344 WRITE_TO_TARGET_TO_U(args
,source
,target
,args
->offsets
,(source
- args
->source
-1),
1345 *toUnicodeStatus
,data
->currentDeltaToUnicode
,err
);
1346 *toUnicodeStatus
= missingCharMarker
;
1350 args
->target
= target
;
1351 args
->source
= source
;
1354 /* structure for SafeClone calculations */
/*
 * Layout that _ISCII_SafeClone() imposes on the caller-supplied buffer:
 * it casts the buffer to this struct, reports sizeof(struct cloneISCIIStruct)
 * as the required size, and copies the source converter's extraInfo into
 * the embedded mydata member.
 * NOTE(review): _ISCII_SafeClone() also accesses a `cnv` member through this
 * struct; its declaration is not on the lines shown here.
 */
1355 struct cloneISCIIStruct
/* ISCII state stored inside the clone itself; cnv.extraInfo is pointed here */
1358 UConverterDataISCII mydata
;
/*
 * Clones the ISCII-specific part of a converter into caller-provided storage.
 *
 * cnv         - converter being cloned; cnv->extraInfo is the copy source.
 * stackBuffer - destination storage, reinterpreted as struct cloneISCIIStruct.
 * pBufferSize - in/out: when *pBufferSize is 0 the call is a preflight request
 *               and the value is overwritten with
 *               sizeof(struct cloneISCIIStruct).
 * status      - tested with U_FAILURE() before any other work is done.
 *
 * On the copy path the function returns &localClone->cnv.
 * NOTE(review): the statements that leave the function in the failure and
 * preflight branches are not on the lines shown here - confirm what they
 * return.
 * NOTE(review): no comparison of *pBufferSize against bufferSizeNeeded is
 * visible; presumably the caller (ucnv_safeClone) guarantees the buffer is
 * large enough - verify.
 */
1363 _ISCII_SafeClone(const UConverter
*cnv
,
1365 int32_t *pBufferSize
,
1368 struct cloneISCIIStruct
* localClone
;
/* size the caller must provide: the whole clone struct */
1369 int32_t bufferSizeNeeded
= sizeof(struct cloneISCIIStruct
);
/* an error already recorded in *status takes this branch */
1371 if (U_FAILURE(*status
)){
1375 if (*pBufferSize
== 0){ /* 'preflighting' request - set needed size into *pBufferSize */
1376 *pBufferSize
= bufferSizeNeeded
;
/* view the caller's buffer as the clone layout */
1380 localClone
= (struct cloneISCIIStruct
*)stackBuffer
;
1381 /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
/* duplicate the ISCII state so the clone does not share it with cnv */
1383 uprv_memcpy(&localClone
->mydata
, cnv
->extraInfo
, sizeof(UConverterDataISCII
));
/* make the clone use its own embedded copy of the state */
1384 localClone
->cnv
.extraInfo
= &localClone
->mydata
;
/* flag that extraInfo points into the clone's own storage */
1385 localClone
->cnv
.isExtraLocal
= TRUE
;
1387 return &localClone
->cnv
;
/*
 * Adds the code points handled by the ISCII converter to a set via the
 * USetAdder callbacks.
 *
 * sa - adder whose addRange()/add() callbacks receive sa->set as the first
 *      argument.
 * cnv, which, pErrorCode - not referenced on the lines shown here.
 *
 * What gets added, in order:
 *   1. the range 0..ASCII_END;
 *   2. for every script from DEVANAGARI through MALAYALAM, each offset idx in
 *      [0, DELTA) that validityTable flags for that script's mask, mapped to
 *      idx + script * DELTA + INDIC_BLOCK_BEGIN;
 *   3. DANDA, DOUBLE_DANDA, ZWNJ and ZWJ.
 */
1391 _ISCIIGetUnicodeSet(const UConverter
*cnv
,
1392 const USetAdder
*sa
,
1393 UConverterUnicodeSet which
,
1394 UErrorCode
*pErrorCode
)
1396 int32_t idx
, script
;
1399 /* Since all ISCII versions allow switching to other ISCII
1400 scripts, we add all roundtrippable characters to this set. */
1401 sa
->addRange(sa
->set
, 0, ASCII_END
);
/* walk every script page from DEVANAGARI to MALAYALAM inclusive */
1402 for (script
= DEVANAGARI
; script
<= MALAYALAM
; script
++) {
/* validity bit(s) for this script, taken from lookupInitialData */
1403 mask
= (uint8_t)(lookupInitialData
[script
].maskEnum
);
/* idx is the offset of a code point inside the script's page */
1404 for (idx
= 0; idx
< DELTA
; idx
++) {
1405 /* added check for TELUGU character */
/* offset 0x31 of the TELUGU page is added even if validityTable does not flag it */
1406 if ((validityTable
[idx
] & mask
) || (script
==TELUGU
&& idx
==0x31)) {
1407 sa
->add(sa
->set
, idx
+ (script
* DELTA
) + INDIC_BLOCK_BEGIN
);
/* individual code points added regardless of script */
1412 sa
->add(sa
->set
, DANDA
);
1413 sa
->add(sa
->set
, DOUBLE_DANDA
);
1414 sa
->add(sa
->set
, ZWNJ
);
1415 sa
->add(sa
->set
, ZWJ
);
/*
 * File-local implementation table (UConverterImpl) for the ISCII converter.
 * Only four of its entries are on the lines shown here: the to-Unicode
 * routine occupies two consecutive slots and so does the from-Unicode
 * routine.
 * NOTE(review): presumably each pair is the plain and the with-offsets entry
 * point, both served by the *_OFFSETS_LOGIC function - confirm against the
 * UConverterImpl declaration.
 */
1418 static const UConverterImpl _ISCIIImpl
={
1429 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1430 UConverter_toUnicode_ISCII_OFFSETS_LOGIC
,
1431 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
1432 UConverter_fromUnicode_ISCII_OFFSETS_LOGIC
,
/*
 * File-local static description (UConverterStaticData) of the ISCII
 * converter. The initializer starts with the size of the structure itself
 * and contains a zero-filled reserved array; the remaining fields are not on
 * the lines shown here.
 */
1442 static const UConverterStaticData _ISCIIStaticData
={
/* first field: size of the structure */
1443 sizeof(UConverterStaticData
),
1456 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
/*
 * Shared-data record (UConverterSharedData) for the ISCII converter.
 * It is declared without `static`, so it has external linkage.
 * The initializer starts with the size of the structure itself; the
 * remaining fields are not on the lines shown here.
 */
1460 const UConverterSharedData _ISCIIData
={
/* first field: size of the structure */
1461 sizeof(UConverterSharedData
),
1471 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */