-/*
+/*
**********************************************************************
-* Copyright (C) 2000-2003, International Business Machines
+* Copyright (C) 2000-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucnvisci.c
*
* created on: 2001JUN26
* created by: Ram Viswanadha
-*
+*
* Date Name Description
* 24/7/2001 Ram Added support for EXT character handling
*/
#include "unicode/utypes.h"
-#if !UCONFIG_NO_LEGACY_CONVERSION
+#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
+#include "unicode/ucnv.h"
+#include "unicode/ucnv_cb.h"
+#include "unicode/utf16.h"
#include "cmemory.h"
-#include "unicode/ucnv_err.h"
#include "ucnv_bld.h"
-#include "unicode/ucnv.h"
#include "ucnv_cnv.h"
-#include "unicode/ucnv_cb.h"
-#include "unicode/uset.h"
#include "cstring.h"
+#include "uassert.h"
#define UCNV_OPTIONS_VERSION_MASK 0xf
#define NUKTA 0x093c
-#define HALANT 0x094d
+#define HALANT 0x094d
#define ZWNJ 0x200c /* Zero Width Non Joiner */
#define ZWJ 0x200d /* Zero width Joiner */
-#define INVALID_CHAR 0xffff
+#define INVALID_CHAR 0xffff
#define ATR 0xEF /* Attribute code */
#define EXT 0xF0 /* Extension code */
#define DANDA 0x0964
#define ISCII_HALANT 0xE8
#define ISCII_DANDA 0xEA
#define ISCII_INV 0xD9
+#define ISCII_VOWEL_SIGN_E 0xE0
#define INDIC_BLOCK_BEGIN 0x0900
-#define INDIC_BLOCK_END 0x0D7F
+#define INDIC_BLOCK_END 0x0D7F
#define INDIC_RANGE (INDIC_BLOCK_END - INDIC_BLOCK_BEGIN)
#define VOCALLIC_RR 0x0931
#define LF 0x0A
#define EXT_RANGE_BEGIN 0xA1
#define EXT_RANGE_END 0xEE
+#define PNJ_DELTA 0x0100
+#define PNJ_BINDI 0x0A02
+#define PNJ_TIPPI 0x0A70
+#define PNJ_SIGN_VIRAMA 0x0A4D
+#define PNJ_ADHAK 0x0A71
+#define PNJ_HA 0x0A39
+#define PNJ_RRA 0x0A5C
-typedef enum {
+typedef enum {
DEVANAGARI =0,
BENGALI,
GURMUKHI,
DELTA=0x80
}UniLang;
-
/**
- * Enumeration for switching code pages if <ATX>+<one of below values>
+ * Enumeration for switching code pages if <ATR>+<one of below values>
* is encountered
*/
typedef enum {
- DEF =0x40,
- RMN =0x41,
- DEV =0x42,
- BNG =0x43,
- TML =0x44,
- TLG =0x45,
- ASM =0x46,
- ORI =0x47,
- KND =0x48,
- MLM =0x49,
- GJR =0x4A,
- PNJ =0x4B,
- ARB =0x71,
- PES =0x72,
- URD =0x73,
- SND =0x74,
- KSM =0x75,
- PST =0x76
+ DEF = 0x40,
+ RMN = 0x41,
+ DEV = 0x42,
+ BNG = 0x43,
+ TML = 0x44,
+ TLG = 0x45,
+ ASM = 0x46,
+ ORI = 0x47,
+ KND = 0x48,
+ MLM = 0x49,
+ GJR = 0x4A,
+ PNJ = 0x4B,
+ ARB = 0x71,
+ PES = 0x72,
+ URD = 0x73,
+ SND = 0x74,
+ KSM = 0x75,
+ PST = 0x76
}ISCIILang;
-typedef enum{
+typedef enum {
DEV_MASK =0x80,
PNJ_MASK =0x40,
GJR_MASK =0x20,
KND_MASK =0x04,
MLM_MASK =0x02,
TML_MASK =0x01,
- ZERO =0x00
+ ZERO =0x00
}MaskEnum;
-typedef struct{
- UChar contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
- UChar contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
- uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
- uint16_t currentDeltaFromUnicode;/* current delta in Indic block */
- uint16_t currentDeltaToUnicode; /* current delta in Indic block */
- MaskEnum currentMaskFromUnicode; /* mask for current state in toUnicode */
- MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
- MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
- UBool isFirstBuffer;
- char name[30];
-}UConverterDataISCII;
-
-static const uint16_t lookupInitialData[][3]={
+#define ISCII_CNV_PREFIX "ISCII,version="
+
+typedef struct { /* Per-converter ISCII state stored in UConverter.extraInfo */
+ UChar contextCharToUnicode; /* previous Unicode codepoint for contextual analysis */
+ UChar contextCharFromUnicode; /* previous Unicode codepoint for contextual analysis */
+ uint16_t defDeltaToUnicode; /* delta for switching to default state when DEF is encountered */
+ uint16_t currentDeltaFromUnicode; /* current delta in Indic block (fromUnicode direction) */
+ uint16_t currentDeltaToUnicode; /* current delta in Indic block (toUnicode direction) */
+ MaskEnum currentMaskFromUnicode; /* mask for current state in fromUnicode */
+ MaskEnum currentMaskToUnicode; /* mask for current state in toUnicode */
+ MaskEnum defMaskToUnicode; /* mask for default state in toUnicode */
+ UBool isFirstBuffer; /* boolean for fromUnicode to see if we need to announce the first script */
+ UBool resetToDefaultToUnicode; /* boolean for resetting to default delta and mask when a newline is encountered */
+ char name[sizeof(ISCII_CNV_PREFIX) + 1]; /* ISCII_CNV_PREFIX, one version digit, and the terminating NUL (sizeof already counts the NUL) */
+ UChar32 prevToUnicodeStatus; /* Hold the previous toUnicodeStatus. This is necessary because we may need to know the last two code points. */
+} UConverterDataISCII;
+
+typedef struct LookupDataStruct { /* one row of lookupInitialData, selected by the converter's version option */
+ UniLang uniLang; /* script index; _ISCIIOpen multiplies it by DELTA to obtain the initial delta */
+ MaskEnum maskEnum; /* initial mask copied into the current and default mask fields */
+ ISCIILang isciiLang; /* language code byte that fromUnicode emits after ATR */
+} LookupDataStruct;
+
+static const LookupDataStruct lookupInitialData[]={
{ DEVANAGARI, DEV_MASK, DEV },
{ BENGALI, BNG_MASK, BNG },
{ GURMUKHI, PNJ_MASK, PNJ },
{ KANNADA, KND_MASK, KND },
{ MALAYALAM, MLM_MASK, MLM }
};
-
-static void
-_ISCIIOpen(UConverter *cnv, const char *name,const char *locale,uint32_t options, UErrorCode *errorCode){
- cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISCII));
- if(cnv->extraInfo != NULL) {
+/*
+ * For special handling of certain Gurmukhi characters.
+ * Indexed by (code point - 0x0A00); the 80 entries cover 0x0A00..0x0A4F
+ * and are laid out below in rows of 16. Callers must range-check the code
+ * point before indexing (see isPNJConsonant / isPNJBindiTippi).
+ * Bit 0 (value 1): PNJ consonant
+ * Bit 1 (value 2): PNJ Bindi Tippi
+ */
+static const uint8_t pnjMap[80] = {
+ /* 0A00..0A0F */
+ 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
+ /* 0A10..0A1F */
+ 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ /* 0A20..0A2F */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3,
+ /* 0A30..0A3F */
+ 3, 0, 0, 0, 0, 3, 3, 0, 3, 3, 0, 0, 0, 0, 0, 2,
+ /* 0A40..0A4F */
+ 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * Tells whether c is flagged as a Gurmukhi (PNJ) consonant, i.e. whether
+ * bit 0 of its pnjMap entry is set. Code points outside 0x0A00..0x0A4F
+ * have no entry and yield FALSE.
+ */
+static UBool
+isPNJConsonant(UChar32 c) {
+    /* Index the table only for code points it actually covers. */
+    if (0xa00 <= c && c < 0xa50) {
+        const uint8_t flags = pnjMap[c - 0xa00];
+        return (UBool)(flags & 1);
+    }
+    return FALSE;
+}
+
+/*
+ * Tells whether c carries the "Bindi Tippi" flag (bit 1) in pnjMap.
+ * The result is the table entry shifted right by one, which is 1 for the
+ * table's values 2 and 3 and 0 otherwise. Code points outside
+ * 0x0A00..0x0A4F have no entry and yield FALSE.
+ */
+static UBool
+isPNJBindiTippi(UChar32 c) {
+    /* Index the table only for code points it actually covers. */
+    if (0xa00 <= c && c < 0xa50) {
+        const uint8_t flags = pnjMap[c - 0xa00];
+        return (UBool)(flags >> 1);
+    }
+    return FALSE;
+}
+
+static void _ISCIIOpen(UConverter *cnv, UConverterLoadArgs *pArgs, UErrorCode *errorCode) { /* Allocates cnv->extraInfo and initializes the ISCII state from the version bits of pArgs->options */
+ if(pArgs->onlyTestIsLoadable) { /* loadability probe only: return before allocating any state */
+ return;
+ }
+
+ cnv->extraInfo = uprv_malloc(sizeof(UConverterDataISCII));
+
+ if (cnv->extraInfo != NULL) {
int32_t len=0;
- UConverterDataISCII *converterData=(UConverterDataISCII *) cnv->extraInfo;
+ UConverterDataISCII *converterData=
+ (UConverterDataISCII *) cnv->extraInfo;
converterData->contextCharToUnicode=NO_CHAR_MARKER;
cnv->toUnicodeStatus = missingCharMarker;
converterData->contextCharFromUnicode=0x0000;
+ converterData->resetToDefaultToUnicode=FALSE;
/* check if the version requested is supported */
- if((options & UCNV_OPTIONS_VERSION_MASK) < 9){
+ if ((pArgs->options & UCNV_OPTIONS_VERSION_MASK) < 9) { /* NOTE(review): admits indexes 0..8 — confirm lookupInitialData has at least 9 rows (rows may be elided in this view) */
/* initialize state variables */
- converterData->currentDeltaFromUnicode=converterData->currentDeltaToUnicode=
- converterData->defDeltaToUnicode=
- (uint16_t)(lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK][0] * DELTA);
+ converterData->currentDeltaFromUnicode
+ = converterData->currentDeltaToUnicode
+ = converterData->defDeltaToUnicode = (uint16_t)(lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].uniLang * DELTA); /* all three deltas start at the selected script's offset */
- converterData->currentMaskFromUnicode = converterData->currentMaskToUnicode =
- converterData->defMaskToUnicode=lookupInitialData[options & UCNV_OPTIONS_VERSION_MASK][1];
+ converterData->currentMaskFromUnicode
+ = converterData->currentMaskToUnicode
+ = converterData->defMaskToUnicode = lookupInitialData[pArgs->options & UCNV_OPTIONS_VERSION_MASK].maskEnum; /* all three masks start at the selected script's mask */
converterData->isFirstBuffer=TRUE;
- uprv_strcpy(converterData->name,"ISCII,version=");
+ (void)uprv_strcpy(converterData->name, ISCII_CNV_PREFIX); /* name = prefix + version digit, appended below */
len = (int32_t)uprv_strlen(converterData->name);
- converterData->name[len]= (char)((options & UCNV_OPTIONS_VERSION_MASK) + '0');
+ converterData->name[len]= (char)((pArgs->options & UCNV_OPTIONS_VERSION_MASK) + '0'); /* single digit: the version is below 9 on this path */
converterData->name[len+1]=0;
- }else{
+
+ converterData->prevToUnicodeStatus = 0x0000;
+ } else { /* unsupported version: release the state and report U_ILLEGAL_ARGUMENT_ERROR */
uprv_free(cnv->extraInfo);
cnv->extraInfo = NULL;
*errorCode = U_ILLEGAL_ARGUMENT_ERROR;
}
- }else{
+ } else { /* allocation failed */
*errorCode =U_MEMORY_ALLOCATION_ERROR;
}
}
-static void
-_ISCIIClose(UConverter *cnv){
- if(cnv->extraInfo!=NULL) {
- if(!cnv->isExtraLocal) {
+
+static void _ISCIIClose(UConverter *cnv) { /* Releases the ISCII state attached to cnv, if any, and clears the pointer */
+ if (cnv->extraInfo!=NULL) {
+ if (!cnv->isExtraLocal) { /* presumably isExtraLocal marks storage not owned by this converter — confirm */
uprv_free(cnv->extraInfo);
}
cnv->extraInfo=NULL;
}
}
-static const char*
-_ISCIIgetName(const UConverter* cnv){
- if(cnv->extraInfo){
+static const char* _ISCIIgetName(const UConverter* cnv) { /* Returns the name stored in the ISCII state, or NULL when cnv has no extraInfo */
+ if (cnv->extraInfo) {
UConverterDataISCII* myData= (UConverterDataISCII*)cnv->extraInfo;
return myData->name;
}
return NULL;
}
-static void
-_ISCIIReset(UConverter *cnv, UConverterResetChoice choice){
+static void _ISCIIReset(UConverter *cnv, UConverterResetChoice choice) { /* Restores toUnicode and/or fromUnicode state to the defaults, depending on choice */
UConverterDataISCII* data =(UConverterDataISCII *) (cnv->extraInfo);
- if(choice<=UCNV_RESET_TO_UNICODE) {
+ if (choice<=UCNV_RESET_TO_UNICODE) { /* toUnicode state; presumably also taken for a reset of both directions — confirm against the enum order */
cnv->toUnicodeStatus = missingCharMarker;
cnv->mode=0;
data->currentDeltaToUnicode=data->defDeltaToUnicode;
data->currentMaskToUnicode = data->defMaskToUnicode;
data->contextCharToUnicode=NO_CHAR_MARKER;
+ data->prevToUnicodeStatus = 0x0000;
}
- if(choice!=UCNV_RESET_TO_UNICODE) {
- cnv->fromUSurrogateLead=0x0000;
+ if (choice!=UCNV_RESET_TO_UNICODE) { /* fromUnicode state */
+ cnv->fromUChar32=0x0000;
data->contextCharFromUnicode=0x00;
- data->currentMaskFromUnicode=data->defDeltaToUnicode;
+ data->currentMaskFromUnicode=data->defMaskToUnicode; /* mask restored from the default mask (the removed line assigned the default delta here) */
data->currentDeltaFromUnicode=data->defDeltaToUnicode;
+ data->isFirstBuffer=TRUE;
+ data->resetToDefaultToUnicode=FALSE; /* NOTE(review): toUnicode-named flag is cleared only on this fromUnicode path — confirm intended */
}
- data->isFirstBuffer=TRUE;
-
}
-/**
- * The values in validity table are indexed by the lower bits of Unicode
- * range 0x0900 - 0x09ff. The values have a structure like:
+/**
+ * The values in validity table are indexed by the lower bits of Unicode
+ * range 0x0900 - 0x097f. The values have a structure like:
* ---------------------------------------------------------------
* | DEV | PNJ | GJR | ORI | BNG | TLG | MLM | TML |
- * | | | | | ASM | KND | | |
+ * | | | | | ASM | KND | | |
* ---------------------------------------------------------------
- * If a code point is valid in a particular script
+ * If a code point is valid in a particular script
* then that bit is turned on
- *
+ *
* Unicode does not distinguish between Bengali and Assamese so we use 1 bit for
* to represent these languages
- *
+ *
* Telugu and Kannada have same codepoints except for Vocallic_RR which we special case
* and combine and use 1 bit to represent these languages.
*
*/
static const uint8_t validityTable[128] = {
-/* This state table is tool generated please donot edit unless you know exactly what you are doing */
+/* This state table is tool generated please do not edit unless you know exactly what you are doing */
+/* Note: This table was edited to mirror the Windows XP implementation */
/*ISCII:Valid:Unicode */
/*0xa0 : 0x00: 0x900 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
-/*0xa1 : 0xb8: 0x901 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
-/*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
-/*0xa3 : 0xbf: 0x903 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0x00 : 0x00: 0x904 */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0xa1 : 0xb8: 0x901 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
+/*0xa2 : 0xfe: 0x902 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
+/*0xa3 : 0xbf: 0x903 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
+/*0x00 : 0x00: 0x904 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xa4 : 0xff: 0x905 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xa5 : 0xff: 0x906 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xa6 : 0xff: 0x907 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xa7 : 0xff: 0x908 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xa8 : 0xff: 0x909 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xa9 : 0xff: 0x90a */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0xaa : 0xfe: 0x90b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
+/*0xaa : 0xfe: 0x90b */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
/*0x00 : 0x00: 0x90c */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
-/*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0xae : 0x80: 0x90d */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xab : 0x87: 0x90e */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xac : 0xff: 0x90f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xad : 0xff: 0x910 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0xb2 : 0x80: 0x911 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xaf : 0x87: 0x912 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xb0 : 0xff: 0x913 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xb1 : 0xff: 0x914 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xc9 : 0xfe: 0x92b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
/*0xca : 0xfe: 0x92c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
/*0xcb : 0xfe: 0x92d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
-/*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
+/*0xcc : 0xfe: 0x92e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xcd : 0xff: 0x92f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xcf : 0xff: 0x930 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xd0 : 0x87: 0x931 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
/*0xd1 : 0xff: 0x932 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0xd2 : 0xb7: 0x933 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
+/*0xd2 : 0xb7: 0x933 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xd3 : 0x83: 0x934 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + MLM_MASK + TML_MASK ,
-/*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
+/*0xd4 : 0xff: 0x935 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xd5 : 0xfe: 0x936 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
/*0xd6 : 0xbf: 0x937 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xd7 : 0xff: 0x938 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xd8 : 0xff: 0x939 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0x00 : 0x00: 0x93A */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x93B */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
-/*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO ,
+/*0xe9 : 0xda: 0x93c */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x93d */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xda : 0xff: 0x93e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xdb : 0xff: 0x93f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xdd : 0xff: 0x941 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xde : 0xff: 0x942 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xdf : 0xbe: 0x943 */ DEV_MASK + ZERO + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
-/*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
-/*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0x00 : 0x00: 0x944 */ DEV_MASK + ZERO + GJR_MASK + ZERO + BNG_MASK + KND_MASK + ZERO + ZERO ,
+/*0xe3 : 0x80: 0x945 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xe0 : 0x87: 0x946 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xe1 : 0xff: 0x947 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xe2 : 0xff: 0x948 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0xe7 : 0x80: 0x949 */ DEV_MASK + ZERO + GJR_MASK + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xe4 : 0x87: 0x94a */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + KND_MASK + MLM_MASK + TML_MASK ,
/*0xe5 : 0xff: 0x94b */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xe6 : 0xff: 0x94c */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0x00 : 0x00: 0x954 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x955 */ ZERO + ZERO + ZERO + ZERO + ZERO + KND_MASK + ZERO + ZERO ,
/*0x00 : 0x00: 0x956 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + KND_MASK + ZERO + ZERO ,
-/*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + ZERO + ZERO + MLM_MASK + ZERO ,
+/*0x00 : 0x00: 0x957 */ ZERO + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + MLM_MASK + ZERO ,
/*0x00 : 0x00: 0x958 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x959 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x95a */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
-/*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ORI_MASK + ZERO + ZERO + ZERO + ZERO ,
+/*0x00 : 0x00: 0x95b */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x95c */ DEV_MASK + PNJ_MASK + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x95d */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x95e */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x961 */ DEV_MASK + ZERO + ZERO + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + ZERO ,
/*0x00 : 0x00: 0x962 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
/*0x00 : 0x00: 0x963 */ DEV_MASK + ZERO + ZERO + ZERO + BNG_MASK + ZERO + ZERO + ZERO ,
-/*0xea : 0xf8: 0x964 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
-/*0xeaea : 0x00: 0x965*/ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + ZERO + ZERO + ZERO ,
+/*0xea : 0xf8: 0x964 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
+/*0xeaea : 0x00: 0x965*/ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*0xf1 : 0xff: 0x966 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xf2 : 0xff: 0x967 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xf3 : 0xff: 0x968 */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xf8 : 0xff: 0x96d */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xf9 : 0xff: 0x96e */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
/*0xfa : 0xff: 0x96f */ DEV_MASK + PNJ_MASK + GJR_MASK + ORI_MASK + BNG_MASK + KND_MASK + MLM_MASK + TML_MASK ,
-/*0x00 : 0x80: 0x970 */ DEV_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
-
+/*0x00 : 0x80: 0x970 */ DEV_MASK + PNJ_MASK + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO ,
/*
* The length of the array is 128 to provide values for 0x900..0x97f.
* The last 15 entries for 0x971..0x97f of the validity table are all zero
/*0x00 : 0x00: 0x9yz */ ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO + ZERO
};
-static const uint16_t fromUnicodeTable[128]={
- 0x00a0 ,/* 0x0900 */
- 0x00a1 ,/* 0x0901 */
- 0x00a2 ,/* 0x0902 */
- 0x00a3 ,/* 0x0903 */
- 0xFFFF ,/* 0x0904 */
- 0x00a4 ,/* 0x0905 */
- 0x00a5 ,/* 0x0906 */
- 0x00a6 ,/* 0x0907 */
- 0x00a7 ,/* 0x0908 */
- 0x00a8 ,/* 0x0909 */
- 0x00a9 ,/* 0x090a */
- 0x00aa ,/* 0x090b */
- 0xA6E9 ,/* 0x090c */
- 0x00ae ,/* 0x090d */
- 0x00ab ,/* 0x090e */
- 0x00ac ,/* 0x090f */
- 0x00ad ,/* 0x0910 */
- 0x00b2 ,/* 0x0911 */
- 0x00af ,/* 0x0912 */
- 0x00b0 ,/* 0x0913 */
- 0x00b1 ,/* 0x0914 */
- 0x00b3 ,/* 0x0915 */
- 0x00b4 ,/* 0x0916 */
- 0x00b5 ,/* 0x0917 */
- 0x00b6 ,/* 0x0918 */
- 0x00b7 ,/* 0x0919 */
- 0x00b8 ,/* 0x091a */
- 0x00b9 ,/* 0x091b */
- 0x00ba ,/* 0x091c */
- 0x00bb ,/* 0x091d */
- 0x00bc ,/* 0x091e */
- 0x00bd ,/* 0x091f */
- 0x00be ,/* 0x0920 */
- 0x00bf ,/* 0x0921 */
- 0x00c0 ,/* 0x0922 */
- 0x00c1 ,/* 0x0923 */
- 0x00c2 ,/* 0x0924 */
- 0x00c3 ,/* 0x0925 */
- 0x00c4 ,/* 0x0926 */
- 0x00c5 ,/* 0x0927 */
- 0x00c6 ,/* 0x0928 */
- 0x00c7 ,/* 0x0929 */
- 0x00c8 ,/* 0x092a */
- 0x00c9 ,/* 0x092b */
- 0x00ca ,/* 0x092c */
- 0x00cb ,/* 0x092d */
- 0x00cc ,/* 0x092e */
- 0x00cd ,/* 0x092f */
- 0x00cf ,/* 0x0930 */
- 0x00d0 ,/* 0x0931 */
- 0x00d1 ,/* 0x0932 */
- 0x00d2 ,/* 0x0933 */
- 0x00d3 ,/* 0x0934 */
- 0x00d4 ,/* 0x0935 */
- 0x00d5 ,/* 0x0936 */
- 0x00d6 ,/* 0x0937 */
- 0x00d7 ,/* 0x0938 */
- 0x00d8 ,/* 0x0939 */
- 0xFFFF ,/* 0x093A */
- 0xFFFF ,/* 0x093B */
- 0x00e9 ,/* 0x093c */
- 0xEAE9 ,/* 0x093d */
- 0x00da ,/* 0x093e */
- 0x00db ,/* 0x093f */
- 0x00dc ,/* 0x0940 */
- 0x00dd ,/* 0x0941 */
- 0x00de ,/* 0x0942 */
- 0x00df ,/* 0x0943 */
- 0xDFE9 ,/* 0x0944 */
- 0x00e3 ,/* 0x0945 */
- 0x00e0 ,/* 0x0946 */
- 0x00e1 ,/* 0x0947 */
- 0x00e2 ,/* 0x0948 */
- 0x00e7 ,/* 0x0949 */
- 0x00e4 ,/* 0x094a */
- 0x00e5 ,/* 0x094b */
- 0x00e6 ,/* 0x094c */
- 0x00e8 ,/* 0x094d */
- 0x00ec ,/* 0x094e */
- 0x00ed ,/* 0x094f */
+static const uint16_t fromUnicodeTable[128]={
+ 0x00a0 ,/* 0x0900 */
+ 0x00a1 ,/* 0x0901 */
+ 0x00a2 ,/* 0x0902 */
+ 0x00a3 ,/* 0x0903 */
+ 0xa4e0 ,/* 0x0904 */
+ 0x00a4 ,/* 0x0905 */
+ 0x00a5 ,/* 0x0906 */
+ 0x00a6 ,/* 0x0907 */
+ 0x00a7 ,/* 0x0908 */
+ 0x00a8 ,/* 0x0909 */
+ 0x00a9 ,/* 0x090a */
+ 0x00aa ,/* 0x090b */
+ 0xA6E9 ,/* 0x090c */
+ 0x00ae ,/* 0x090d */
+ 0x00ab ,/* 0x090e */
+ 0x00ac ,/* 0x090f */
+ 0x00ad ,/* 0x0910 */
+ 0x00b2 ,/* 0x0911 */
+ 0x00af ,/* 0x0912 */
+ 0x00b0 ,/* 0x0913 */
+ 0x00b1 ,/* 0x0914 */
+ 0x00b3 ,/* 0x0915 */
+ 0x00b4 ,/* 0x0916 */
+ 0x00b5 ,/* 0x0917 */
+ 0x00b6 ,/* 0x0918 */
+ 0x00b7 ,/* 0x0919 */
+ 0x00b8 ,/* 0x091a */
+ 0x00b9 ,/* 0x091b */
+ 0x00ba ,/* 0x091c */
+ 0x00bb ,/* 0x091d */
+ 0x00bc ,/* 0x091e */
+ 0x00bd ,/* 0x091f */
+ 0x00be ,/* 0x0920 */
+ 0x00bf ,/* 0x0921 */
+ 0x00c0 ,/* 0x0922 */
+ 0x00c1 ,/* 0x0923 */
+ 0x00c2 ,/* 0x0924 */
+ 0x00c3 ,/* 0x0925 */
+ 0x00c4 ,/* 0x0926 */
+ 0x00c5 ,/* 0x0927 */
+ 0x00c6 ,/* 0x0928 */
+ 0x00c7 ,/* 0x0929 */
+ 0x00c8 ,/* 0x092a */
+ 0x00c9 ,/* 0x092b */
+ 0x00ca ,/* 0x092c */
+ 0x00cb ,/* 0x092d */
+ 0x00cc ,/* 0x092e */
+ 0x00cd ,/* 0x092f */
+ 0x00cf ,/* 0x0930 */
+ 0x00d0 ,/* 0x0931 */
+ 0x00d1 ,/* 0x0932 */
+ 0x00d2 ,/* 0x0933 */
+ 0x00d3 ,/* 0x0934 */
+ 0x00d4 ,/* 0x0935 */
+ 0x00d5 ,/* 0x0936 */
+ 0x00d6 ,/* 0x0937 */
+ 0x00d7 ,/* 0x0938 */
+ 0x00d8 ,/* 0x0939 */
+ 0xFFFF ,/* 0x093A */
+ 0xFFFF ,/* 0x093B */
+ 0x00e9 ,/* 0x093c */
+ 0xEAE9 ,/* 0x093d */
+ 0x00da ,/* 0x093e */
+ 0x00db ,/* 0x093f */
+ 0x00dc ,/* 0x0940 */
+ 0x00dd ,/* 0x0941 */
+ 0x00de ,/* 0x0942 */
+ 0x00df ,/* 0x0943 */
+ 0xDFE9 ,/* 0x0944 */
+ 0x00e3 ,/* 0x0945 */
+ 0x00e0 ,/* 0x0946 */
+ 0x00e1 ,/* 0x0947 */
+ 0x00e2 ,/* 0x0948 */
+ 0x00e7 ,/* 0x0949 */
+ 0x00e4 ,/* 0x094a */
+ 0x00e5 ,/* 0x094b */
+ 0x00e6 ,/* 0x094c */
+ 0x00e8 ,/* 0x094d */
+ 0x00ec ,/* 0x094e */
+ 0x00ed ,/* 0x094f */
0xA1E9 ,/* 0x0950 */ /* OM Symbol */
- 0xFFFF ,/* 0x0951 */
- 0xF0B8 ,/* 0x0952 */
- 0xFFFF ,/* 0x0953 */
- 0xFFFF ,/* 0x0954 */
- 0xFFFF ,/* 0x0955 */
- 0xFFFF ,/* 0x0956 */
- 0xFFFF ,/* 0x0957 */
- 0xb3e9 ,/* 0x0958 */
- 0xb4e9 ,/* 0x0959 */
- 0xb5e9 ,/* 0x095a */
- 0xbae9 ,/* 0x095b */
- 0xbfe9 ,/* 0x095c */
- 0xC0E9 ,/* 0x095d */
- 0xc9e9 ,/* 0x095e */
- 0x00ce ,/* 0x095f */
- 0xAAe9 ,/* 0x0960 */
- 0xA7E9 ,/* 0x0961 */
- 0xDBE9 ,/* 0x0962 */
- 0xDCE9 ,/* 0x0963 */
- 0x00ea ,/* 0x0964 */
- 0xeaea ,/* 0x0965 */
- 0x00f1 ,/* 0x0966 */
- 0x00f2 ,/* 0x0967 */
- 0x00f3 ,/* 0x0968 */
- 0x00f4 ,/* 0x0969 */
- 0x00f5 ,/* 0x096a */
- 0x00f6 ,/* 0x096b */
- 0x00f7 ,/* 0x096c */
- 0x00f8 ,/* 0x096d */
- 0x00f9 ,/* 0x096e */
- 0x00fa ,/* 0x096f */
+ 0xFFFF ,/* 0x0951 */
+ 0xF0B8 ,/* 0x0952 */
+ 0xFFFF ,/* 0x0953 */
+ 0xFFFF ,/* 0x0954 */
+ 0xFFFF ,/* 0x0955 */
+ 0xFFFF ,/* 0x0956 */
+ 0xFFFF ,/* 0x0957 */
+ 0xb3e9 ,/* 0x0958 */
+ 0xb4e9 ,/* 0x0959 */
+ 0xb5e9 ,/* 0x095a */
+ 0xbae9 ,/* 0x095b */
+ 0xbfe9 ,/* 0x095c */
+ 0xC0E9 ,/* 0x095d */
+ 0xc9e9 ,/* 0x095e */
+ 0x00ce ,/* 0x095f */
+ 0xAAe9 ,/* 0x0960 */
+ 0xA7E9 ,/* 0x0961 */
+ 0xDBE9 ,/* 0x0962 */
+ 0xDCE9 ,/* 0x0963 */
+ 0x00ea ,/* 0x0964 */
+ 0xeaea ,/* 0x0965 */
+ 0x00f1 ,/* 0x0966 */
+ 0x00f2 ,/* 0x0967 */
+ 0x00f3 ,/* 0x0968 */
+ 0x00f4 ,/* 0x0969 */
+ 0x00f5 ,/* 0x096a */
+ 0x00f6 ,/* 0x096b */
+ 0x00f7 ,/* 0x096c */
+ 0x00f8 ,/* 0x096d */
+ 0x00f9 ,/* 0x096e */
+ 0x00fa ,/* 0x096f */
0xF0BF ,/* 0x0970 */
0xFFFF ,/* 0x0971 */
0xFFFF ,/* 0x0972 */
0xFFFF /* 0xff */
};
+static const uint16_t vowelSignESpecialCases[][2]={
+ { 2 /*length of array*/ , 0 },
+ { 0xA4 , 0x0904 },
+};
+
static const uint16_t nuktaSpecialCases[][2]={
{ 16 /*length of array*/ , 0 },
{ 0xA6 , 0x090c },
{ 0xAA , 0x0960 },
{ 0xA7 , 0x0961 },
{ 0xDB , 0x0962 },
- { 0xDC , 0x0963 },
+ { 0xDC , 0x0963 },
};
-#define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
+
+#define WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err){ \
+ int32_t offset = (int32_t)(source - args->source-1); \
/* write the targetUniChar to target */ \
- if(target <targetLimit){ \
+ if(target < targetLimit){ \
if(targetByteUnit <= 0xFF){ \
*(target)++ = (uint8_t)(targetByteUnit); \
if(offsets){ \
- *(offsets++) = (int32_t)(source - args->source-1); \
+ *(offsets++) = offset; \
} \
}else{ \
- *(target)++ = (uint8_t)(targetByteUnit>>8); \
- if(offsets){ \
- *(offsets++) = (int32_t)(source - args->source-1); \
- } \
- if(target < targetLimit){ \
- *(target)++ = (uint8_t) targetByteUnit; \
- if(offsets){ \
- *(offsets++) = (int32_t)(source - args->source-1); \
+ if (targetByteUnit > 0xFFFF) { \
+ *(target)++ = (uint8_t)(targetByteUnit>>16); \
+ if (offsets) { \
+ --offset; \
+ *(offsets++) = offset; \
} \
- }else{ \
+ } \
+ if (!(target < targetLimit)) { \
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
+ (uint8_t)(targetByteUnit >> 8); \
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
- (uint8_t) (targetByteUnit); \
+ (uint8_t)targetByteUnit; \
*err = U_BUFFER_OVERFLOW_ERROR; \
+ } else { \
+ *(target)++ = (uint8_t)(targetByteUnit>>8); \
+ if(offsets){ \
+ *(offsets++) = offset; \
+ } \
+ if(target < targetLimit){ \
+ *(target)++ = (uint8_t) targetByteUnit; \
+ if(offsets){ \
+ *(offsets++) = offset ; \
+ } \
+ }else{ \
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] =\
+ (uint8_t) (targetByteUnit); \
+ *err = U_BUFFER_OVERFLOW_ERROR; \
+ } \
} \
} \
}else{ \
+ if (targetByteUnit & 0xFF0000) { \
+ args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
+ (uint8_t) (targetByteUnit >>16); \
+ } \
if(targetByteUnit & 0xFF00){ \
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = \
(uint8_t) (targetByteUnit >>8); \
(uint8_t) (targetByteUnit); \
*err = U_BUFFER_OVERFLOW_ERROR; \
} \
-}
+}
/* Rules:
- * Explicit Halant :
+ * Explicit Halant :
* <HALANT> + <ZWNJ>
* Soft Halant :
- * <HALANT> + <ZWJ>
+ * <HALANT> + <ZWJ>
*/
-static void
-UConverter_fromUnicode_ISCII_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
- UErrorCode * err){
+static void UConverter_fromUnicode_ISCII_OFFSETS_LOGIC(
+ UConverterFromUnicodeArgs * args, UErrorCode * err) {
const UChar *source = args->source;
const UChar *sourceLimit = args->sourceLimit;
unsigned char *target = (unsigned char *) args->target;
int32_t* offsets = args->offsets;
uint32_t targetByteUnit = 0x0000;
UChar32 sourceChar = 0x0000;
- UConverterCallbackReason reason;
- UBool useFallback;
+ UChar32 tempContextFromUnicode = 0x0000; /* For special handling of the Gurmukhi script. */
UConverterDataISCII *converterData;
uint16_t newDelta=0;
uint16_t range = 0;
UBool deltaChanged = FALSE;
- if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)){
+ if ((args->converter == NULL) || (args->targetLimit < args->target) || (args->sourceLimit < args->source)) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
/* initialize data */
converterData=(UConverterDataISCII*)args->converter->extraInfo;
- useFallback = args->converter->useFallback;
newDelta=converterData->currentDeltaFromUnicode;
range = (uint16_t)(newDelta/DELTA);
-
- if(args->converter->fromUSurrogateLead!=0 && target <targetLimit) {
+
+ if ((sourceChar = args->converter->fromUChar32)!=0) {
goto getTrail;
}
/*writing the char to the output stream */
- while(source < sourceLimit){
-
- targetByteUnit = missingCharMarker;
-
+ while (source < sourceLimit) {
+ /* Write the language code following LF only if LF is not the last character. */
+ if (args->converter->fromUnicodeStatus == LF) {
+ targetByteUnit = ATR<<8;
+ targetByteUnit += (uint8_t) lookupInitialData[range].isciiLang;
+ args->converter->fromUnicodeStatus = 0x0000;
+ /* now append ATR and language code */
+ WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
+ if (U_FAILURE(*err)) {
+ break;
+ }
+ }
+
sourceChar = *source++;
-
+ tempContextFromUnicode = converterData->contextCharFromUnicode;
+
+ targetByteUnit = missingCharMarker;
+
/*check if input is in ASCII and C0 control codes range*/
if (sourceChar <= ASCII_END) {
+ args->converter->fromUnicodeStatus = sourceChar;
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,sourceChar,err);
- if(U_FAILURE(*err)){
+ if (U_FAILURE(*err)) {
break;
}
- if(sourceChar == LF){
- targetByteUnit = ATR<<8;
- targetByteUnit += (uint8_t) lookupInitialData[range][2];
- args->converter->fromUnicodeStatus=sourceChar;
- /* now append ATR and language code */
- WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
- if(U_FAILURE(*err)){
- break;
- }
- }
continue;
}
- switch(sourceChar){
+ switch (sourceChar) {
case ZWNJ:
/* contextChar has HALANT */
- if(converterData->contextCharFromUnicode){
+ if (converterData->contextCharFromUnicode) {
converterData->contextCharFromUnicode = 0x00;
targetByteUnit = ISCII_HALANT;
- }else{
+ } else {
/* consume ZWNJ and continue */
converterData->contextCharFromUnicode = 0x00;
continue;
break;
case ZWJ:
/* contextChar has HALANT */
- if(converterData->contextCharFromUnicode){
- targetByteUnit = ISCII_NUKTA;
- }else{
+ if (converterData->contextCharFromUnicode) {
+ targetByteUnit = ISCII_NUKTA;
+ } else {
targetByteUnit =ISCII_INV;
}
converterData->contextCharFromUnicode = 0x00;
break;
- default:
+ default:
/* is the sourceChar in the INDIC_RANGE? */
- if((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE){
- /* Danda and Double Danda are valid in Northern scripts.. since Unicode
- * does not include these codepoints in all Northern scrips we need to
+ if ((uint16_t)(INDIC_BLOCK_END-sourceChar) <= INDIC_RANGE) {
+ /* Danda and Double Danda are valid in Northern scripts.. since Unicode
+ * does not include these codepoints in all Northern scripts we need to
* filter them out
*/
- if(sourceChar!= DANDA && sourceChar != DOUBLE_DANDA){
- /* find out to which block the souceChar belongs*/
+ if (sourceChar!= DANDA && sourceChar != DOUBLE_DANDA) {
+ /* find out to which block the sourceChar belongs*/
range =(uint16_t)((sourceChar-INDIC_BLOCK_BEGIN)/DELTA);
newDelta =(uint16_t)(range*DELTA);
/* Now are we in the same block as the previous? */
- if(newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer){
+ if (newDelta!= converterData->currentDeltaFromUnicode || converterData->isFirstBuffer) {
converterData->currentDeltaFromUnicode = newDelta;
- converterData->currentMaskFromUnicode = lookupInitialData[range][1];
+ converterData->currentMaskFromUnicode = lookupInitialData[range].maskEnum;
deltaChanged =TRUE;
converterData->isFirstBuffer=FALSE;
}
+
+ if (converterData->currentDeltaFromUnicode == PNJ_DELTA) {
+ if (sourceChar == PNJ_TIPPI) {
+ /* Make sure Tippi is converted to Bindi. */
+ sourceChar = PNJ_BINDI;
+ } else if (sourceChar == PNJ_ADHAK) {
+ /* This is for consonant cluster handling. */
+ converterData->contextCharFromUnicode = PNJ_ADHAK;
+ }
+
+ }
/* Normalize all Indic codepoints to Devanagari and map them to ISCII */
/* now subtract the new delta from sourceChar*/
- sourceChar -= converterData->currentDeltaFromUnicode ;
+ sourceChar -= converterData->currentDeltaFromUnicode;
}
- /* get the target byte unit */
+ /* get the target byte unit */
targetByteUnit=fromUnicodeTable[(uint8_t)sourceChar];
-
+
/* is the code point valid in current script? */
- if((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0){
- /* Vocallic RR is assigne in ISCII Telugu and Unicode */
- if(converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) && sourceChar!=VOCALLIC_RR){
+ if ((validityTable[(uint8_t)sourceChar] & converterData->currentMaskFromUnicode)==0) {
+ /* Vocallic RR is assigned in ISCII Telugu and Unicode */
+ if (converterData->currentDeltaFromUnicode!=(TELUGU_DELTA) || sourceChar!=VOCALLIC_RR) {
targetByteUnit=missingCharMarker;
}
}
-
- if(deltaChanged){
- /* we are in a script block which is different than
- * previous sourceChar's script block write ATR and language codes
+
+ if (deltaChanged) {
+ /* we are in a script block which is different than
+ * previous sourceChar's script block write ATR and language codes
*/
- uint16_t temp=0;
+ uint32_t temp=0;
temp =(uint16_t)(ATR<<8);
- temp += (uint16_t)((uint8_t) lookupInitialData[range][2]);
+ temp += (uint16_t)((uint8_t) lookupInitialData[range].isciiLang);
/* reset */
deltaChanged=FALSE;
/* now append ATR and language code */
WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,temp,err);
- if(U_FAILURE(*err)){
+ if (U_FAILURE(*err)) {
break;
}
}
+
+ if (converterData->currentDeltaFromUnicode == PNJ_DELTA && (sourceChar + PNJ_DELTA) == PNJ_ADHAK) {
+ continue;
+ }
}
/* reset context char */
converterData->contextCharFromUnicode = 0x00;
break;
}
-
-
- if(targetByteUnit != missingCharMarker){
- if(targetByteUnit==ISCII_HALANT){
+ if (converterData->currentDeltaFromUnicode == PNJ_DELTA && tempContextFromUnicode == PNJ_ADHAK && isPNJConsonant((sourceChar + PNJ_DELTA))) {
+ /* If the previous codepoint is Adhak and the current codepoint is a consonant, the targetByteUnit should be C + Halant + C. */
+ /* reset context char */
+ converterData->contextCharFromUnicode = 0x0000;
+ targetByteUnit = targetByteUnit << 16 | ISCII_HALANT << 8 | targetByteUnit;
+ /* write targetByteUnit to target */
+ WRITE_TO_TARGET_FROM_U(args, offsets, source, target, targetLimit, targetByteUnit,err);
+ if (U_FAILURE(*err)) {
+ break;
+ }
+ } else if (targetByteUnit != missingCharMarker) {
+ if (targetByteUnit==ISCII_HALANT) {
converterData->contextCharFromUnicode = (UChar)targetByteUnit;
}
- /* write targetByteUnit to target*/
- WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
- if(U_FAILURE(*err)){
- break;
- }
- }
- else{
- /* oops.. the code point is unassingned
- * set the error and reason
- */
- reason =UCNV_UNASSIGNED;
- *err =U_INVALID_CHAR_FOUND;
-
+ /* write targetByteUnit to target*/
+ WRITE_TO_TARGET_FROM_U(args,offsets,source,target,targetLimit,targetByteUnit,err);
+ if (U_FAILURE(*err)) {
+ break;
+ }
+ } else {
+ /* oops.. the code point is unassigned */
/*check if the char is a First surrogate*/
- if(UTF_IS_SURROGATE(sourceChar)) {
- if(UTF_IS_SURROGATE_FIRST(sourceChar)) {
- args->converter->fromUSurrogateLead=(UChar)sourceChar;
+ if (U16_IS_SURROGATE(sourceChar)) {
+ if (U16_IS_SURROGATE_LEAD(sourceChar)) {
getTrail:
/*look ahead to find the trail surrogate*/
- if(source < sourceLimit) {
+ if (source < sourceLimit) {
/* test the following code unit */
UChar trail= (*source);
- if(UTF_IS_SECOND_SURROGATE(trail)) {
+ if (U16_IS_TRAIL(trail)) {
source++;
- sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
- args->converter->fromUSurrogateLead=0x00;
- reason =UCNV_UNASSIGNED;
+ sourceChar=U16_GET_SUPPLEMENTARY(sourceChar, trail);
*err =U_INVALID_CHAR_FOUND;
/* convert this surrogate code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
- sourceChar = args->converter->fromUSurrogateLead;
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
} else {
/* no more input */
*err = U_ZERO_ERROR;
- break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
- reason=UCNV_ILLEGAL;
*err=U_ILLEGAL_CHAR_FOUND;
}
+ } else {
+ /* callback(unassigned) for a BMP code point */
+ *err = U_INVALID_CHAR_FOUND;
}
- {
- /*variables for callback */
- const UChar* saveSource =NULL;
- char* saveTarget =NULL;
- int32_t* saveOffsets =NULL;
- int currentOffset =0;
- int32_t saveIndex =0;
-
- args->converter->invalidUCharLength = 0;
-
- if(sourceChar>0xffff){
- /* we have got a surrogate pair... dissable and populate the invalidUCharBuffer */
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
- =(uint16_t)(((sourceChar)>>10)+0xd7c0);
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
- =(uint16_t)(((sourceChar)&0x3ff)|0xdc00);
- }
- else{
- args->converter->invalidUCharBuffer[args->converter->invalidUCharLength++]
- =(UChar)sourceChar;
- }
-
- if(offsets){
- currentOffset = *(offsets-1)+1;
- }
- saveSource = args->source;
- saveTarget = args->target;
- saveOffsets = args->offsets;
- args->target = (char*)target;
- args->source = source;
- args->offsets = offsets;
-
- /*copies current values for the ErrorFunctor to update */
- /*Calls the ErrorFunctor */
- args->converter->fromUCharErrorBehaviour ( args->converter->fromUContext,
- args,
- args->converter->invalidUCharBuffer,
- args->converter->invalidUCharLength,
- (UChar32) (sourceChar),
- reason,
- err);
-
- saveIndex = (int32_t)(args->target - (char*)target);
- if(args->offsets){
- args->offsets = saveOffsets;
- while(saveIndex-->0){
- *offsets = currentOffset;
- offsets++;
- }
- }
- target = (unsigned char*)args->target;
- args->source=saveSource;
- args->target=saveTarget;
- args->offsets=saveOffsets;
- args->converter->fromUSurrogateLead=0x00;
- if (U_FAILURE (*err)){
- break;
- }
- }
+ args->converter->fromUChar32=sourceChar;
+ break;
}
-
-
}/* end while(mySourceIndex<mySourceLength) */
-
- /*If at the end of conversion we are still carrying state information
- *flush is TRUE, we can deduce that the input stream is truncated
- */
- if (args->converter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
- *err = U_TRUNCATED_CHAR_FOUND;
- }
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (source == sourceLimit) && args->flush){
- /*reset converter*/
- _ISCIIReset(args->converter,UCNV_RESET_FROM_UNICODE);
- }
-
/*save the state and return */
args->source = source;
args->target = (char*)target;
}
-static const int32_t lookupTable[][2]={
+static const uint16_t lookupTable[][2]={
{ ZERO, ZERO }, /*DEFALT*/
{ ZERO, ZERO }, /*ROMAN*/
{ DEVANAGARI, DEV_MASK },
{ BENGALI, BNG_MASK },
{ ORIYA, ORI_MASK },
{ KANNADA, KND_MASK },
+ { MALAYALAM, MLM_MASK },
{ GUJARATI, GJR_MASK },
- { GURMUKHI, PNJ_MASK },
+ { GURMUKHI, PNJ_MASK }
};
#define WRITE_TO_TARGET_TO_U(args,source,target,offsets,offset,targetUniChar,delta, err){\
(UChar)targetUniChar; \
*err = U_BUFFER_OVERFLOW_ERROR; \
} \
-}
-
+}
+
#define GET_MAPPING(sourceChar,targetUniChar,data){ \
targetUniChar = toUnicodeTable[(sourceChar)] ; \
/* is the code point valid in current script? */ \
if(sourceChar> ASCII_END && \
- (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode)==0){ \
+ (validityTable[(targetUniChar & 0x7F)] & data->currentMaskToUnicode)==0){ \
/* Vocallic RR is assigne in ISCII Telugu and Unicode */ \
- if(data->currentDeltaToUnicode!=(TELUGU_DELTA) && \
+ if(data->currentDeltaToUnicode!=(TELUGU_DELTA) || \
targetUniChar!=VOCALLIC_RR){ \
targetUniChar=missingCharMarker; \
} \
/***********
* Rules for ISCII to Unicode converter
* ISCII is stateful encoding. To convert ISCII bytes to Unicode,
- * which has both precomposed and decomposed forms characters
+ * which has both precomposed and decomposed forms characters
* pre-context and post-context need to be considered.
- *
+ *
* Post context
- * i) ATR : Attribute code is used to declare the font and script switching.
+ * i) ATR : Attribute code is used to declare the font and script switching.
* Currently we only switch scripts and font codes consumed without generating an error
- * ii) EXT : Extention code is used to declare switching to Sanskrit and for obscure,
+ * ii) EXT : Extension code is used to declare switching to Sanskrit and for obscure,
* obsolete characters
- * Pre context
+ * Pre context
* i) Halant: if preceeded by a halant then it is a explicit halant
- * ii) Nukta :
+ * ii) Nukta :
* a) if preceeded by a halant then it is a soft halant
* b) if preceeded by specific consonants and the ligatures have pre-composed
* characters in Unicode then convert to pre-composed characters
* iii) Danda: If Danda is preceeded by a Danda then convert to Double Danda
- *
+ *
*/
-static void
-UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
- UErrorCode* err){
+static void UConverter_toUnicode_ISCII_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err) {
const char *source = ( char *) args->source;
UChar *target = args->target;
const char *sourceLimit = args->sourceLimit;
uint32_t targetUniChar = 0x0000;
uint8_t sourceChar = 0x0000;
UConverterDataISCII* data;
- UConverterCallbackReason reason;
UChar32* toUnicodeStatus=NULL;
- UChar* contextCharToUnicode = NULL;
+ UChar32 tempTargetUniChar = 0x0000;
+ UChar* contextCharToUnicode= NULL;
+ UBool found;
+ int i;
+ int offset = 0;
- if ((args->converter == NULL) || (target < args->target) || (source < args->source)){
+ if ((args->converter == NULL) || (target < args->target) || (source < args->source)) {
*err = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
-
+
data = (UConverterDataISCII*)(args->converter->extraInfo);
contextCharToUnicode = &data->contextCharToUnicode; /* contains previous ISCII codepoint visited */
toUnicodeStatus = (UChar32*)&args->converter->toUnicodeStatus;/* contains the mapping to Unicode of the above codepoint*/
- while(source<sourceLimit){
+ while (U_SUCCESS(*err) && source<sourceLimit) {
targetUniChar = missingCharMarker;
-
- if(target < targetLimit){
+
+ if (target < targetLimit) {
sourceChar = (unsigned char)*(source)++;
/* look at the post-context preform special processing */
- if(*contextCharToUnicode==ATR){
-
+ if (*contextCharToUnicode==ATR) {
+
/* If we have ATR in *contextCharToUnicode then we need to change our
* state to the Indic Script specified by sourceChar
*/
/* check if the sourceChar is supported script range*/
- if((uint8_t)(PNJ-sourceChar)<=PNJ-DEV){
- data->currentDeltaToUnicode =
- (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA);
- data->currentMaskToUnicode =
- lookupTable[sourceChar & 0x0F][1] ;
- }
- else if(sourceChar==DEF){
+ if ((uint8_t)(PNJ-sourceChar)<=PNJ-DEV) {
+ data->currentDeltaToUnicode = (uint16_t)(lookupTable[sourceChar & 0x0F][0] * DELTA);
+ data->currentMaskToUnicode = (MaskEnum)lookupTable[sourceChar & 0x0F][1];
+ } else if (sourceChar==DEF) {
/* switch back to default */
data->currentDeltaToUnicode = data->defDeltaToUnicode;
data->currentMaskToUnicode = data->defMaskToUnicode;
- }else{
-
- if((sourceChar >= 0x21 && sourceChar <= 0x3F)){
+ } else {
+ if ((sourceChar >= 0x21 && sourceChar <= 0x3F)) {
/* these are display codes consume and continue */
- }else{
+ } else {
*err =U_ILLEGAL_CHAR_FOUND;
/* reset */
*contextCharToUnicode=NO_CHAR_MARKER;
- reason = UCNV_ILLEGAL;
goto CALLBACK;
}
-
}
/* reset */
- *contextCharToUnicode=NO_CHAR_MARKER;
-
+ *contextCharToUnicode=NO_CHAR_MARKER;
+
continue;
- }else if(*contextCharToUnicode==EXT){
+ } else if (*contextCharToUnicode==EXT) {
/* check if sourceChar is in 0xA1-0xEE range */
- if((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)){
+ if ((uint8_t) (EXT_RANGE_END - sourceChar) <= (EXT_RANGE_END - EXT_RANGE_BEGIN)) {
/* We currently support only Anudatta and Devanagari abbreviation sign */
- if(sourceChar==0xBF || sourceChar == 0xB8){
+ if (sourceChar==0xBF || sourceChar == 0xB8) {
targetUniChar = (sourceChar==0xBF) ? DEV_ABBR_SIGN : DEV_ANUDATTA;
- /* find out if the mapping is valid in this state */
- if(validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode){
-
+ /* find out if the mapping is valid in this state */
+ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
*contextCharToUnicode= NO_CHAR_MARKER;
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
/* write to target */
- WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),
- targetUniChar,data->currentDeltaToUnicode,err);
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
continue;
}
/* byte unit is unassigned */
targetUniChar = missingCharMarker;
*err= U_INVALID_CHAR_FOUND;
- reason = UCNV_UNASSIGNED;
- }else{
+ } else {
/* only 0xA1 - 0xEE are legal after EXT char */
*contextCharToUnicode= NO_CHAR_MARKER;
- reason= UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
goto CALLBACK;
- }else if(*contextCharToUnicode==ISCII_INV){
- if(sourceChar==ISCII_HALANT){
+ } else if (*contextCharToUnicode==ISCII_INV) {
+ if (sourceChar==ISCII_HALANT) {
targetUniChar = 0x0020; /* replace with space accoding to Indic FAQ */
- }else{
+ } else {
targetUniChar = ZWJ;
}
-
+
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
/* write to target */
- WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),
- targetUniChar,data->currentDeltaToUnicode,err);
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
/* reset */
*contextCharToUnicode=NO_CHAR_MARKER;
}
/* look at the pre-context and perform special processing */
- switch(sourceChar){
+ switch (sourceChar) {
case ISCII_INV:
- case EXT: /*falls through*/
+ case EXT:
case ATR:
*contextCharToUnicode = (UChar)sourceChar;
-
- if(*toUnicodeStatus != missingCharMarker){
- WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),
- *toUnicodeStatus,data->currentDeltaToUnicode,err);
+ if (*toUnicodeStatus != missingCharMarker) {
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
*toUnicodeStatus = missingCharMarker;
}
continue;
case ISCII_DANDA:
/* handle double danda*/
- if(*contextCharToUnicode== ISCII_DANDA){
+ if (*contextCharToUnicode== ISCII_DANDA) {
targetUniChar = DOUBLE_DANDA;
/* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER;
*toUnicodeStatus = missingCharMarker;
- }else{
+ } else {
GET_MAPPING(sourceChar,targetUniChar,data);
*contextCharToUnicode = sourceChar;
}
break;
case ISCII_HALANT:
/* handle explicit halant */
- if(*contextCharToUnicode == ISCII_HALANT){
+ if (*contextCharToUnicode == ISCII_HALANT) {
targetUniChar = ZWNJ;
/* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER;
- }else{
+ } else {
GET_MAPPING(sourceChar,targetUniChar,data);
*contextCharToUnicode = sourceChar;
}
break;
+ case 0x0A:
+ case 0x0D:
+ data->resetToDefaultToUnicode = TRUE;
+ GET_MAPPING(sourceChar,targetUniChar,data)
+ ;
+ *contextCharToUnicode = sourceChar;
+ break;
+
+ case ISCII_VOWEL_SIGN_E:
+ i=1;
+ found=FALSE;
+ for (; i<vowelSignESpecialCases[0][0]; i++) {
+ U_ASSERT(i<UPRV_LENGTHOF(vowelSignESpecialCases));
+ if (vowelSignESpecialCases[i][0]==(uint8_t)*contextCharToUnicode) {
+ targetUniChar=vowelSignESpecialCases[i][1];
+ found=TRUE;
+ break;
+ }
+ }
+ if (found) {
+ /* find out if the mapping is valid in this state */
+ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
+ /*targetUniChar += data->currentDeltaToUnicode ;*/
+ *contextCharToUnicode= NO_CHAR_MARKER;
+ *toUnicodeStatus = missingCharMarker;
+ break;
+ }
+ }
+ GET_MAPPING(sourceChar,targetUniChar,data);
+ *contextCharToUnicode = sourceChar;
+ break;
+
case ISCII_NUKTA:
/* handle soft halant */
- if(*contextCharToUnicode == ISCII_HALANT){
+ if (*contextCharToUnicode == ISCII_HALANT) {
targetUniChar = ZWJ;
/* clear the context */
*contextCharToUnicode = NO_CHAR_MARKER;
break;
- }else{
+ } else if (data->currentDeltaToUnicode == PNJ_DELTA && data->contextCharToUnicode == 0xc0) {
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
+ /* We got here because ISCII_NUKTA was preceded by 0xc0 and we are converting Gurmukhi.
+ * In that case we must convert (0xc0 0xe9) to (\u0a5c\u0a4d\u0a39).
+ */
+ targetUniChar = PNJ_RRA;
+ WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
+ if (U_SUCCESS(*err)) {
+ targetUniChar = PNJ_SIGN_VIRAMA;
+ WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
+ if (U_SUCCESS(*err)) {
+ targetUniChar = PNJ_HA;
+ WRITE_TO_TARGET_TO_U(args, source, target, args->offsets, (source-args->source)-2, targetUniChar, 0, err);
+ } else {
+ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
+ }
+ } else {
+ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_SIGN_VIRAMA;
+ args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]= PNJ_HA;
+ }
+ *toUnicodeStatus = missingCharMarker;
+ data->contextCharToUnicode = NO_CHAR_MARKER;
+ continue;
+ } else {
/* try to handle <CHAR> + ISCII_NUKTA special mappings */
- int i=1;
- UBool found =FALSE;
- for( ;i<nuktaSpecialCases[0][0];i++){
- if(nuktaSpecialCases[i][0]==(uint8_t)*contextCharToUnicode){
+ i=1;
+ found =FALSE;
+ for (; i<nuktaSpecialCases[0][0]; i++) {
+ if (nuktaSpecialCases[i][0]==(uint8_t)
+ *contextCharToUnicode) {
targetUniChar=nuktaSpecialCases[i][1];
found =TRUE;
break;
}
}
- if(found){
- /* find out if the mapping is valid in this state */
- if(validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode){
- targetUniChar += data->currentDeltaToUnicode ;
+ if (found) {
+ /* find out if the mapping is valid in this state */
+ if (validityTable[(uint8_t)targetUniChar] & data->currentMaskToUnicode) {
+ /*targetUniChar += data->currentDeltaToUnicode ;*/
*contextCharToUnicode= NO_CHAR_MARKER;
*toUnicodeStatus = missingCharMarker;
+ if (data->currentDeltaToUnicode == PNJ_DELTA) {
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),targetUniChar,data->currentDeltaToUnicode,err);
+ continue;
+ }
break;
}
/* else fall through to default */
}
/* else fall through to default */
+ U_FALLTHROUGH;
}
- default:
- GET_MAPPING(sourceChar,targetUniChar,data);
+ default:GET_MAPPING(sourceChar,targetUniChar,data)
+ ;
*contextCharToUnicode = sourceChar;
break;
}
-
- if(*toUnicodeStatus != missingCharMarker){
- /* write the previously mapped codepoint */
- WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),
- *toUnicodeStatus,data->currentDeltaToUnicode,err);
+ if (*toUnicodeStatus != missingCharMarker) {
+ /* Check to make sure that consonant clusters are handled correctly for Gurmukhi script. */
+ if (data->currentDeltaToUnicode == PNJ_DELTA && data->prevToUnicodeStatus != 0 && isPNJConsonant(data->prevToUnicodeStatus) &&
+ (*toUnicodeStatus + PNJ_DELTA) == PNJ_SIGN_VIRAMA && (targetUniChar + PNJ_DELTA) == data->prevToUnicodeStatus) {
+ /* Consonant clusters C + HALANT + C should be encoded as ADHAK + C */
+ offset = (int)(source-args->source - 3);
+ tempTargetUniChar = PNJ_ADHAK; /* This is necessary to avoid some compiler warnings. */
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,tempTargetUniChar,0,err);
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,offset,data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000; /* reset the previous unicode code point */
+ *toUnicodeStatus = missingCharMarker;
+ continue;
+ } else {
+ /* Write the previous toUnicodeStatus, this was delayed to handle consonant clustering for Gurmukhi script. */
+ if (data->prevToUnicodeStatus) {
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -1),data->prevToUnicodeStatus,0,err);
+ data->prevToUnicodeStatus = 0x0000;
+ }
+ /* Check to make sure that Bindi and Tippi are handled correctly for Gurmukhi script.
+ * If 0xA2 is preceded by a codepoint in the PNJ_BINDI_TIPPI_SET then the target codepoint should be Tippi instead of Bindi.
+ */
+ if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_BINDI && isPNJBindiTippi((*toUnicodeStatus + PNJ_DELTA))) {
+ targetUniChar = PNJ_TIPPI - PNJ_DELTA;
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,PNJ_DELTA,err);
+ } else if (data->currentDeltaToUnicode == PNJ_DELTA && (targetUniChar + PNJ_DELTA) == PNJ_SIGN_VIRAMA && isPNJConsonant((*toUnicodeStatus + PNJ_DELTA))) {
+ /* Store the current toUnicodeStatus code point for later handling of consonant cluster in Gurmukhi. */
+ data->prevToUnicodeStatus = *toUnicodeStatus + PNJ_DELTA;
+ } else {
+ /* write the previously mapped codepoint */
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source-args->source -2),*toUnicodeStatus,data->currentDeltaToUnicode,err);
+ }
+ }
*toUnicodeStatus = missingCharMarker;
}
-
- if(targetUniChar != missingCharMarker ){
+ if (targetUniChar != missingCharMarker) {
/* now save the targetUniChar for delayed write */
*toUnicodeStatus = (UChar) targetUniChar;
- }else{
-
- /* we reach here only if targetUniChar == missingCharMarker
+ if (data->resetToDefaultToUnicode==TRUE) {
+ data->currentDeltaToUnicode = data->defDeltaToUnicode;
+ data->currentMaskToUnicode = data->defMaskToUnicode;
+ data->resetToDefaultToUnicode=FALSE;
+ }
+ } else {
+
+ /* we reach here only if targetUniChar == missingCharMarker
* so assign codes to reason and err
*/
- reason = UCNV_UNASSIGNED;
*err = U_INVALID_CHAR_FOUND;
CALLBACK:
- {
- const char *saveSource = args->source;
- UChar *saveTarget = args->target;
- int32_t *saveOffsets = NULL;
- int32_t currentOffset = (int32_t)(source - args->source -1);
- int32_t saveIndex = (int32_t)(target - args->target);
-
- args->converter->invalidCharLength=0;
-
- args->converter->invalidCharBuffer[args->converter->invalidCharLength++] =
- (char) sourceChar;
-
- if(args->offsets){
- saveOffsets=args->offsets;
- args->offsets = args->offsets+(target - args->target);
- }
-
- args->target =target;
- target =saveTarget;
- args->source = source;
-
- args->converter->fromCharErrorBehaviour (
- args->converter->toUContext,
- args,
- args->converter->invalidCharBuffer,
- args->converter->invalidCharLength,
- reason,
- err);
-
- if(args->offsets){
- args->offsets = saveOffsets;
-
- for (;saveIndex < (args->target - target);saveIndex++) {
- *(args->offsets)++ = currentOffset;
- }
- }
- target=args->target;
- args->source = saveSource;
- args->target = saveTarget;
- }
+ args->converter->toUBytes[0] = (uint8_t) sourceChar;
+ args->converter->toULength = 1;
+ break;
}
- }
- else{
+ } else {
*err =U_BUFFER_OVERFLOW_ERROR;
break;
}
}
- if((args->flush==TRUE)
- && (source == sourceLimit)
- && data->contextCharToUnicode != NO_CHAR_MARKER){
- /* if we have ATR in context it is an error */
- if(data->contextCharToUnicode==ATR || data->contextCharToUnicode==EXT || *toUnicodeStatus == missingCharMarker){
- *err = U_TRUNCATED_CHAR_FOUND;
- }else{
- WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),
- *toUnicodeStatus,data->currentDeltaToUnicode,err);
- *toUnicodeStatus = missingCharMarker;
+
+ if (U_SUCCESS(*err) && args->flush && source == sourceLimit) {
+ /* end of the input stream */
+ UConverter *cnv = args->converter;
+
+ if (*contextCharToUnicode==ATR || *contextCharToUnicode==EXT || *contextCharToUnicode==ISCII_INV) {
+ /* set toUBytes[] */
+ cnv->toUBytes[0] = (uint8_t)*contextCharToUnicode;
+ cnv->toULength = 1;
+
+ /* avoid looping on truncated sequences */
+ *contextCharToUnicode = NO_CHAR_MARKER;
+ } else {
+ cnv->toULength = 0;
}
+ if (*toUnicodeStatus != missingCharMarker) {
+ /* output a remaining target character */
+ WRITE_TO_TARGET_TO_U(args,source,target,args->offsets,(source - args->source -1),*toUnicodeStatus,data->currentDeltaToUnicode,err);
+ *toUnicodeStatus = missingCharMarker;
+ }
}
- /* Reset the state of converter if we consumed
- * the source and flush is true
- */
- if( (source == sourceLimit) && args->flush){
- /*reset converter*/
- _ISCIIReset(args->converter,UCNV_RESET_TO_UNICODE);
- }
+
args->target = target;
args->source = source;
}
/* structure for SafeClone calculations */
-struct cloneStruct
-{
+struct cloneISCIIStruct {
UConverter cnv;
UConverterDataISCII mydata;
};
-
-static UConverter *
-_ISCII_SafeClone(const UConverter *cnv,
- void *stackBuffer,
- int32_t *pBufferSize,
+static UConverter *
+_ISCII_SafeClone(const UConverter *cnv,
+ void *stackBuffer,
+ int32_t *pBufferSize,
UErrorCode *status)
{
- struct cloneStruct * localClone;
- int32_t bufferSizeNeeded = sizeof(struct cloneStruct);
+ struct cloneISCIIStruct * localClone;
+ int32_t bufferSizeNeeded = sizeof(struct cloneISCIIStruct);
- if (U_FAILURE(*status)){
+ if (U_FAILURE(*status)) {
return 0;
}
- if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */
+ if (*pBufferSize == 0) { /* 'preflighting' request - set needed size into *pBufferSize */
*pBufferSize = bufferSizeNeeded;
return 0;
}
- localClone = (struct cloneStruct *)stackBuffer;
- uprv_memcpy(&localClone->cnv, cnv, sizeof(UConverter));
- localClone->cnv.isCopyLocal = TRUE;
+ localClone = (struct cloneISCIIStruct *)stackBuffer;
+ /* ucnv.c/ucnv_safeClone() copied the main UConverter already */
uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(UConverterDataISCII));
localClone->cnv.extraInfo = &localClone->mydata;
static void
_ISCIIGetUnicodeSet(const UConverter *cnv,
- USet *set,
+ const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode)
{
/* Since all ISCII versions allow switching to other ISCII
scripts, we add all roundtrippable characters to this set. */
- uset_addRange(set, 0, ASCII_END);
+ sa->addRange(sa->set, 0, ASCII_END);
for (script = DEVANAGARI; script <= MALAYALAM; script++) {
- mask = (uint8_t)(lookupInitialData[script][1]);
+ mask = (uint8_t)(lookupInitialData[script].maskEnum);
for (idx = 0; idx < DELTA; idx++) {
- if (validityTable[idx] & mask) {
- uset_add(set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN);
+ /* added check for TELUGU character */
+ if ((validityTable[idx] & mask) || (script==TELUGU && idx==0x31)) {
+ sa->add(sa->set, idx + (script * DELTA) + INDIC_BLOCK_BEGIN);
}
}
}
- uset_add(set, DANDA);
- uset_add(set, DOUBLE_DANDA);
- uset_add(set, ZWNJ);
- uset_add(set, ZWJ);
+ sa->add(sa->set, DANDA);
+ sa->add(sa->set, DOUBLE_DANDA);
+ sa->add(sa->set, ZWNJ);
+ sa->add(sa->set, ZWJ);
}
static const UConverterImpl _ISCIIImpl={
UCNV_ISCII,
-
+
NULL,
NULL,
-
+
_ISCIIOpen,
_ISCIIClose,
_ISCIIReset,
-
+
UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
UConverter_toUnicode_ISCII_OFFSETS_LOGIC,
UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
UConverter_fromUnicode_ISCII_OFFSETS_LOGIC,
NULL,
-
+
NULL,
_ISCIIgetName,
NULL,
static const UConverterStaticData _ISCIIStaticData={
sizeof(UConverterStaticData),
"ISCII",
- 0,
- UCNV_IBM,
- UCNV_ISCII,
- 1,
+ 0,
+ UCNV_IBM,
+ UCNV_ISCII,
+ 1,
4,
{ 0x1a, 0, 0, 0 },
0x1,
- FALSE,
+ FALSE,
FALSE,
0x0,
0x0,
{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }, /* reserved */
};
-
-const UConverterSharedData _ISCIIData={
- sizeof(UConverterSharedData),
- ~((uint32_t) 0),
- NULL,
- NULL,
- &_ISCIIStaticData,
- FALSE,
- &_ISCIIImpl,
- 0
-};
+
+const UConverterSharedData _ISCIIData=
+ UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_ISCIIStaticData, &_ISCIIImpl);
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */