]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/tools/toolutil/ucm.c
ICU-6.2.4.tar.gz
[apple/icu.git] / icuSources / tools / toolutil / ucm.c
diff --git a/icuSources/tools/toolutil/ucm.c b/icuSources/tools/toolutil/ucm.c
new file mode 100644 (file)
index 0000000..108e432
--- /dev/null
@@ -0,0 +1,1177 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 2003-2004, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  ucm.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2003jun20
+*   created by: Markus W. Scherer
+*
+*   This file reads a .ucm file, stores its mappings and sorts them.
+*   It implements handling of Unicode conversion mappings from .ucm files
+*   for makeconv, canonucm, rptp2ucm, etc.
+*
+*   Unicode code point sequences with a length of more than 1,
+*   as well as byte sequences with more than 4 bytes or more than one complete
+*   character sequence are handled to support m:n mappings.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "filestrm.h"
+#include "uarrsort.h"
+#include "ucnvmbcs.h"
+#include "ucnv_bld.h"
+#include "ucnv_ext.h"
+#include "uparse.h"
+#include "ucm.h"
+#include <stdio.h>
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Write one mapping to f in .ucm syntax: each code point as <UXXXX>,
+ * one space, each byte as \xXX, and then " |f" if the precision flag m->f
+ * is not negative. A negative flag prints nothing after the bytes.
+ *
+ * codePoints and bytes are passed in separately from m;
+ * m->uLen code points and m->bLen bytes are read from them.
+ */
+static void
+printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
+    int32_t j;
+
+    for(j=0; j<m->uLen; ++j) {
+        /* %lX consumes an unsigned long */
+        fprintf(f, "<U%04lX>", (unsigned long)codePoints[j]);
+    }
+
+    fputc(' ', f);
+
+    for(j=0; j<m->bLen; ++j) {
+        /* %X consumes an unsigned int */
+        fprintf(f, "\\x%02X", (unsigned int)bytes[j]);
+    }
+
+    if(m->f>=0) {
+        /* %u consumes an unsigned int; m->f is not negative here */
+        fprintf(f, " |%u\n", (unsigned int)m->f);
+    } else {
+        fputs("\n", f);
+    }
+}
+
+/* Print mapping m, which is stored in table, to f in .ucm syntax. */
+U_CAPI void U_EXPORT2
+ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
+    UChar32 *codePoints=UCM_GET_CODE_POINTS(table, m);
+    uint8_t *bytes=UCM_GET_BYTES(table, m);
+
+    printMapping(m, codePoints, bytes, f);
+}
+
+/*
+ * Print all mappings of table to f.
+ * With byUnicode, the mappings are printed in the order of the mappings array;
+ * otherwise they are printed in the order given by table->reverseMap.
+ */
+U_CAPI void U_EXPORT2
+ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
+    UCMapping *mappings=table->mappings;
+    int32_t count=table->mappingsLength;
+    int32_t k;
+
+    if(!byUnicode) {
+        /* reverseMap holds indexes into the mappings array */
+        const int32_t *order=table->reverseMap;
+        for(k=0; k<count; ++k) {
+            ucm_printMapping(table, mappings+order[k], f);
+        }
+        return;
+    }
+
+    for(k=0; k<count; ++k) {
+        ucm_printMapping(table, mappings+k, f);
+    }
+}
+
+/* mapping comparisons ------------------------------------------------------ */
+
+/*
+ * Compare the Unicode sides of two mappings, which may live in different tables.
+ * Returns <0, 0 or >0. Code points are compared one by one;
+ * if one sequence is a prefix of the other, the shorter one sorts first.
+ */
+static int32_t
+compareUnicode(UCMTable *lTable, const UCMapping *l,
+               UCMTable *rTable, const UCMapping *r) {
+    const UChar32 *left, *right;
+    int32_t k, common, diff;
+
+    /* shortcut: two single code points are read directly from the mappings */
+    if(l->uLen==1 && r->uLen==1) {
+        return l->u-r->u;
+    }
+
+    left=UCM_GET_CODE_POINTS(lTable, l);
+    right=UCM_GET_CODE_POINTS(rTable, r);
+
+    /* number of code points that both sequences have */
+    common= l->uLen<=r->uLen ? l->uLen : r->uLen;
+
+    for(k=0; k<common; ++k) {
+        diff=left[k]-right[k];
+        if(diff!=0) {
+            return diff;
+        }
+    }
+
+    /* all common code points are equal: decide by length */
+    return l->uLen-r->uLen;
+}
+
+/*
+ * Compare the byte sides of two mappings, which may live in different tables.
+ * Returns <0, 0 or >0.
+ *
+ * lexical=TRUE: compare byte by byte, and a prefix sorts before a longer
+ * sequence. The builder sorts this way so that it can efficiently search
+ * for a byte sequence that could be a prefix of a previously entered one.
+ *
+ * lexical=FALSE: shorter sequences sort before longer ones, and only
+ * sequences of the same length are compared byte by byte.
+ * This is for compatibility with old .ucm tools like canonucm and rptp2ucm.
+ */
+static int32_t
+compareBytes(UCMTable *lTable, const UCMapping *l,
+             UCMTable *rTable, const UCMapping *r,
+             UBool lexical) {
+    const uint8_t *left, *right;
+    int32_t k, count, diff;
+
+    if(!lexical) {
+        /* different lengths decide immediately */
+        diff=l->bLen-r->bLen;
+        if(diff!=0) {
+            return diff;
+        }
+        count=l->bLen;
+    } else {
+        /* compare only as many bytes as both sequences have */
+        count= l->bLen<=r->bLen ? l->bLen : r->bLen;
+    }
+
+    left=UCM_GET_BYTES(lTable, l);
+    right=UCM_GET_BYTES(rTable, r);
+
+    for(k=0; k<count; ++k) {
+        diff=left[k]-right[k];
+        if(diff!=0) {
+            return diff;
+        }
+    }
+
+    /* all compared bytes are equal: decide by length */
+    return l->bLen-r->bLen;
+}
+
+/*
+ * Compare two UCMappings for sorting.
+ * uFirst=TRUE:  Unicode side first, then bytes by length and value (like canonucm).
+ * uFirst=FALSE: bytes lexically first (for the builder), then the Unicode side.
+ * If both sides are equal, the precision flags decide.
+ */
+static int32_t
+compareMappings(UCMTable *lTable, const UCMapping *l,
+                UCMTable *rTable, const UCMapping *r,
+                UBool uFirst) {
+    int32_t diff;
+
+    if(uFirst) {
+        diff=compareUnicode(lTable, l, rTable, r);
+        if(diff!=0) {
+            return diff;
+        }
+        diff=compareBytes(lTable, l, rTable, r, FALSE);
+    } else {
+        diff=compareBytes(lTable, l, rTable, r, TRUE);
+        if(diff!=0) {
+            return diff;
+        }
+        diff=compareUnicode(lTable, l, rTable, r);
+    }
+
+    if(diff==0) {
+        /* same Unicode and same bytes: order by flag */
+        diff=l->f-r->f;
+    }
+    return diff;
+}
+
+/*
+ * Comparison callback for sorting the mappings array itself, Unicode first.
+ * context is the UCMTable; left and right point to UCMappings.
+ */
+static int32_t
+compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
+    UCMTable *table=(UCMTable *)context;
+    const UCMapping *lMapping=(const UCMapping *)left;
+    const UCMapping *rMapping=(const UCMapping *)right;
+
+    return compareMappings(table, lMapping, table, rMapping, TRUE);
+}
+
+/*
+ * Comparison callback for sorting the reverseMap, bytes first.
+ * context is the UCMTable; left and right point to int32_t indexes
+ * into table->mappings.
+ */
+static int32_t
+compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
+    UCMTable *table=(UCMTable *)context;
+    const UCMapping *lMapping=table->mappings+*(const int32_t *)left;
+    const UCMapping *rMapping=table->mappings+*(const int32_t *)right;
+
+    return compareMappings(table, lMapping, table, rMapping, FALSE);
+}
+
+/*
+ * Sort a table unless it is already marked as sorted:
+ * the mappings array is sorted by Unicode first, and reverseMap is filled
+ * with the mappings indexes and sorted by bytes first.
+ * Prints a message and exits the program if the reverseMap cannot be
+ * allocated or if sorting fails.
+ */
+U_CAPI void U_EXPORT2
+ucm_sortTable(UCMTable *t) {
+    UErrorCode errorCode=U_ZERO_ERROR;
+    int32_t k, count;
+
+    if(t->isSorted) {
+        return;
+    }
+
+    /* step 1: sort the mappings themselves, Unicode first */
+    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
+                   compareMappingsUnicodeFirst, t,
+                   FALSE, &errorCode);
+
+    if(t->reverseMap==NULL) {
+        /*
+         * Size the reverseMap by mappingsCapacity rather than mappingsLength:
+         * then it need not be reallocated each time mappings are added
+         * (see moveMappings() and ucm_addMapping()).
+         */
+        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
+        if(t->reverseMap==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    /* start with the identity permutation */
+    count=t->mappingsLength;
+    for(k=0; k<count; ++k) {
+        t->reverseMap[k]=k;
+    }
+
+    /* step 2: sort the indexes by the bytes of the mappings that they refer to */
+    uprv_sortArray(t->reverseMap, count, sizeof(int32_t),
+                   compareMappingsBytesFirst, t,
+                   FALSE, &errorCode);
+
+    /* one check covers both sort calls */
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
+                u_errorName(errorCode));
+        exit(errorCode);
+    }
+
+    t->isSorted=TRUE;
+}
+
+enum { /* bits for UCMapping.moveFlag; moveMappings() removes every mapping whose moveFlag is not 0 */
+    MOVE_TO_EXT=1, /* moveMappings() also adds the mapping to its ext argument if that is not NULL */
+    REMOVE_MAPPING=2 /* the mapping is only removed from its table */
+};
+
+/*
+ * Remove every mapping whose moveFlag is set from the base table.
+ * A mapping flagged with MOVE_TO_EXT is first added to ext, if ext is not NULL.
+ * Removed slots are filled with the last mapping, so the base table
+ * is marked as not sorted.
+ *
+ * works only with explicit precision flags because it uses some of the
+ * flags bits
+ */
+static void
+moveMappings(UCMTable *base, UCMTable *ext) {
+    UCMapping *cur, *limit;
+    int8_t moveFlag;
+
+    cur=base->mappings;
+    limit=cur+base->mappingsLength;
+
+    while(cur<limit) {
+        moveFlag=cur->moveFlag;
+        if(moveFlag==0) {
+            /* this mapping stays */
+            ++cur;
+            continue;
+        }
+
+        /* clear the flag before the mapping is copied anywhere */
+        cur->moveFlag=0;
+
+        if(ext!=NULL && (moveFlag&MOVE_TO_EXT)) {
+            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
+        }
+
+        /*
+         * Shrink the table by one and fill the hole with the former last
+         * mapping; cur is not advanced so that the moved-in mapping is
+         * examined next.
+         */
+        --limit;
+        if(cur<limit) {
+            uprv_memcpy(cur, limit, sizeof(UCMapping));
+        }
+        --base->mappingsLength;
+        base->isSorted=FALSE;
+    }
+}
+
+enum { /* result bits of checkBaseExtUnicode() and checkBaseExtBytes() */
+    NEEDS_MOVE=1, /* at least one moveFlag was set; ucm_checkBaseExt() then calls moveMappings() */
+    HAS_ERRORS=2 /* an error was printed; ucm_checkBaseExt() then returns FALSE */
+};
+
+static uint8_t /* returns a bit set of NEEDS_MOVE and HAS_ERRORS */
+checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
+                    UBool moveToExt, UBool intersectBase) { /* baseStates is not used in this function */
+    UCMapping *mb, *me, *mbLimit, *meLimit; /* current mapping and limit for base (mb) and ext (me) */
+    int32_t cmp;
+    uint8_t result;
+
+    mb=base->mappings;
+    mbLimit=mb+base->mappingsLength;
+
+    me=ext->mappings;
+    meLimit=me+ext->mappingsLength;
+
+    result=0;
+
+    for(;;) { /* walk both mappings arrays in parallel; ucm_checkBaseExt() sorts both tables before calling */
+        /* skip irrelevant mappings on both sides: only flags |0..|2 take part; return when either side is exhausted */
+        for(;;) {
+            if(mb==mbLimit) {
+                return result;
+            }
+
+            if(0<=mb->f && mb->f<=2) {
+                break;
+            }
+
+            ++mb;
+        }
+
+        for(;;) {
+            if(me==meLimit) {
+                return result;
+            }
+
+            if(0<=me->f && me->f<=2) {
+                break;
+            }
+
+            ++me;
+        }
+
+        /* compare the base and extension mappings by their Unicode sides only */
+        cmp=compareUnicode(base, mb, ext, me);
+        if(cmp<0) {
+            if(intersectBase && (intersectBase!=2 || mb->bLen>1)) {
+                /*
+                 * mapping in base but not in ext, move it
+                 *
+                 * if ext is DBCS, move DBCS mappings here
+                 * and check SBCS ones for Unicode prefix below
+                 * (with intersectBase==2, only base mappings with bLen>1 are flagged here)
+                 */
+                mb->moveFlag|=MOVE_TO_EXT;
+                result|=NEEDS_MOVE;
+
+            /* does mb map from an input sequence that is a prefix of me's? */
+            } else if( mb->uLen<me->uLen &&
+                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) /* 4 bytes per UChar32, as in ucm_addMapping() */
+            ) {
+                if(moveToExt) {
+                    /* mark this mapping to be moved to the extension table */
+                    mb->moveFlag|=MOVE_TO_EXT;
+                    result|=NEEDS_MOVE;
+                } else {
+                    fprintf(stderr,
+                            "ucm error: the base table contains a mapping whose input sequence\n"
+                            "           is a prefix of the input sequence of an extension mapping\n");
+                    ucm_printMapping(base, mb, stderr);
+                    ucm_printMapping(ext, me, stderr);
+                    result|=HAS_ERRORS;
+                }
+            }
+
+            ++mb; /* the base mapping sorts lower: advance only the base side */
+        } else if(cmp==0) {
+            /*
+             * same output: remove the extension mapping,
+             * otherwise treat as an error
+             * (unless intersectBase is set: then the base mapping is moved instead)
+             */
+            if( mb->f==me->f && mb->bLen==me->bLen &&
+                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+            ) {
+                me->moveFlag|=REMOVE_MAPPING;
+                result|=NEEDS_MOVE;
+            } else if(intersectBase) {
+                /* mapping in base but not in ext, move it */
+                mb->moveFlag|=MOVE_TO_EXT;
+                result|=NEEDS_MOVE;
+            } else {
+                fprintf(stderr,
+                        "ucm error: the base table contains a mapping whose input sequence\n"
+                        "           is the same as the input sequence of an extension mapping\n"
+                        "           but it maps differently\n");
+                ucm_printMapping(base, mb, stderr);
+                ucm_printMapping(ext, me, stderr);
+                result|=HAS_ERRORS;
+            }
+
+            ++mb; /* me stays, so it is compared with the next base mapping as well */
+        } else /* cmp>0 */ {
+            ++me;
+        }
+    }
+}
+
+static uint8_t /* returns a bit set of NEEDS_MOVE and HAS_ERRORS */
+checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
+                  UBool moveToExt, UBool intersectBase) { /* baseStates is read only for its outputType */
+    UCMapping *mb, *me; /* current base and extension mappings */
+    int32_t *baseMap, *extMap; /* the reverseMaps: indexes into the mappings arrays, sorted by bytes first in ucm_sortTable() */
+    int32_t b, e, bLimit, eLimit, cmp; /* b and e are positions in baseMap and extMap */
+    uint8_t result;
+    UBool isSISO;
+
+    baseMap=base->reverseMap;
+    extMap=ext->reverseMap;
+
+    b=e=0;
+    bLimit=base->mappingsLength;
+    eLimit=ext->mappingsLength;
+
+    result=0;
+
+    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
+
+    for(;;) { /* walk both reverseMaps in parallel */
+        /* skip irrelevant mappings on both sides: only flags |0 and |3 take part; return when either side is exhausted */
+        for(;; ++b) {
+            if(b==bLimit) {
+                return result;
+            }
+            mb=base->mappings+baseMap[b];
+
+            if(intersectBase==2 && mb->bLen==1) {
+                /*
+                 * comparing a base against a DBCS extension:
+                 * leave SBCS base mappings alone
+                 */
+                continue;
+            }
+
+            if(mb->f==0 || mb->f==3) {
+                break;
+            }
+        }
+
+        for(;;) {
+            if(e==eLimit) {
+                return result;
+            }
+            me=ext->mappings+extMap[e];
+
+            if(me->f==0 || me->f==3) {
+                break;
+            }
+
+            ++e;
+        }
+
+        /* compare the base and extension mappings by their bytes only, lexically */
+        cmp=compareBytes(base, mb, ext, me, TRUE);
+        if(cmp<0) {
+            if(intersectBase) {
+                /* mapping in base but not in ext, move it */
+                mb->moveFlag|=MOVE_TO_EXT;
+                result|=NEEDS_MOVE;
+
+            /*
+             * does mb map from an input sequence that is a prefix of me's?
+             * for SI/SO tables, a single byte is never a prefix because it
+             * occurs in a separate single-byte state
+             */
+            } else if( mb->bLen<me->bLen &&
+                (!isSISO || mb->bLen>1) &&
+                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+            ) {
+                if(moveToExt) {
+                    /* mark this mapping to be moved to the extension table */
+                    mb->moveFlag|=MOVE_TO_EXT;
+                    result|=NEEDS_MOVE;
+                } else {
+                    fprintf(stderr,
+                            "ucm error: the base table contains a mapping whose input sequence\n"
+                            "           is a prefix of the input sequence of an extension mapping\n");
+                    ucm_printMapping(base, mb, stderr);
+                    ucm_printMapping(ext, me, stderr);
+                    result|=HAS_ERRORS;
+                }
+            }
+
+            ++b; /* the base mapping sorts lower: advance only the base side */
+        } else if(cmp==0) {
+            /*
+             * same output: remove the extension mapping,
+             * otherwise treat as an error
+             * (unless intersectBase is set: then the base mapping is moved instead)
+             */
+            if( mb->f==me->f && mb->uLen==me->uLen &&
+                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) /* 4 bytes per UChar32, as in ucm_addMapping() */
+            ) {
+                me->moveFlag|=REMOVE_MAPPING;
+                result|=NEEDS_MOVE;
+            } else if(intersectBase) {
+                /* mapping in base but not in ext, move it */
+                mb->moveFlag|=MOVE_TO_EXT;
+                result|=NEEDS_MOVE;
+            } else {
+                fprintf(stderr,
+                        "ucm error: the base table contains a mapping whose input sequence\n"
+                        "           is the same as the input sequence of an extension mapping\n"
+                        "           but it maps differently\n");
+                ucm_printMapping(base, mb, stderr);
+                ucm_printMapping(ext, me, stderr);
+                result|=HAS_ERRORS;
+            }
+
+            ++b; /* e stays, so me is compared with the next base mapping as well */
+        } else /* cmp>0 */ {
+            ++e;
+        }
+    }
+}
+
+/*
+ * Check the byte sequences of all mappings in table against baseStates.
+ * Every mapping for which ucm_countChars() returns less than 1 is printed
+ * to stderr. Returns TRUE if no such mapping was found.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
+    UCMapping *mappings=table->mappings;
+    int32_t length=table->mappingsLength;
+    int32_t k;
+    UBool allValid=TRUE;
+
+    for(k=0; k<length; ++k) {
+        UCMapping *m=mappings+k;
+
+        if(ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen)<1) {
+            /* report every bad mapping, do not stop at the first one */
+            ucm_printMapping(table, m, stderr);
+            allValid=FALSE;
+        }
+    }
+
+    return allValid;
+}
+
+/*
+ * Check a base table against an extension table, on the Unicode side
+ * and on the bytes side, and move or remove mappings as flagged by the checks.
+ *
+ * moveTarget: if not NULL, mappings moved out of the base table are added to
+ *             it, and conflicting base mappings are moved rather than
+ *             reported as errors.
+ * intersectBase: passed through to the checks.
+ *
+ * Returns FALSE if either table has mappings without precision flags
+ * or if the checks reported errors; otherwise TRUE.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkBaseExt(UCMStates *baseStates,
+                 UCMTable *base, UCMTable *ext, UCMTable *moveTarget,
+                 UBool intersectBase) {
+    UBool moveToExt=(UBool)(moveTarget!=NULL);
+    uint8_t unicodeResult, bytesResult, result;
+
+    /* if we have an extension table, we must always use precision flags */
+    if(base->flagsType&UCM_FLAGS_IMPLICIT) {
+        fprintf(stderr, "ucm error: the base table contains mappings without precision flags\n");
+        return FALSE;
+    }
+    if(ext->flagsType&UCM_FLAGS_IMPLICIT) {
+        fprintf(stderr, "ucm error: extension table contains mappings without precision flags\n");
+        return FALSE;
+    }
+
+    /* the checks walk the tables in sorted order */
+    ucm_sortTable(base);
+    ucm_sortTable(ext);
+
+    /* always run both checks so that all problems get reported */
+    unicodeResult=checkBaseExtUnicode(baseStates, base, ext, moveToExt, intersectBase);
+    bytesResult=checkBaseExtBytes(baseStates, base, ext, moveToExt, intersectBase);
+    result=(uint8_t)(unicodeResult|bytesResult);
+
+    if(result&HAS_ERRORS) {
+        return FALSE;
+    }
+    if(!(result&NEEDS_MOVE)) {
+        return TRUE;
+    }
+
+    /* drop flagged extension mappings, then move flagged base mappings */
+    moveMappings(ext, NULL);
+    moveMappings(base, moveTarget);
+
+    ucm_sortTable(base);
+    ucm_sortTable(ext);
+    if(moveTarget!=NULL) {
+        ucm_sortTable(moveTarget);
+    }
+
+    return TRUE;
+}
+
+/* merge tables for rptp2ucm ------------------------------------------------ */
+
+U_CAPI void U_EXPORT2 /* merges the toUTable mappings into fromUTable and sets precision flags */
+ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable,
+                const uint8_t *subchar, int32_t subcharLength,
+                uint8_t subchar1) { /* subchar1==0 turns the single-byte SUB check off */
+    UCMapping *fromUMapping, *toUMapping; /* current mappings in the two tables */
+    int32_t fromUIndex, toUIndex, fromUTop, toUTop, cmp;
+
+    ucm_sortTable(fromUTable);
+    ucm_sortTable(toUTable);
+
+    fromUMapping=fromUTable->mappings;
+    toUMapping=toUTable->mappings;
+
+    fromUTop=fromUTable->mappingsLength; /* fixed here: mappings appended to fromUTable below are not visited */
+    toUTop=toUTable->mappingsLength;
+
+    fromUIndex=toUIndex=0;
+
+    while(fromUIndex<fromUTop && toUIndex<toUTop) { /* parallel walk over both tables, in Unicode-first sort order */
+        cmp=compareMappings(fromUTable, fromUMapping, toUTable, toUMapping, TRUE);
+        if(cmp==0) {
+            /* equal: roundtrip, nothing to do (flags are initially 0) */
+            ++fromUMapping;
+            ++toUMapping;
+
+            ++fromUIndex;
+            ++toUIndex;
+        } else if(cmp<0) {
+            /*
+             * the fromU mapping does not have a toU counterpart:
+             * fallback Unicode->codepage
+             * (|2 if its bytes are the subchar or the single byte subchar1, otherwise |1)
+             */
+            if( (fromUMapping->bLen==subcharLength &&
+                 0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
+                (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
+            ) {
+                fromUMapping->f=2; /* SUB mapping */
+            } else {
+                fromUMapping->f=1; /* normal fallback */
+            }
+
+            ++fromUMapping;
+            ++fromUIndex;
+        } else {
+            /*
+             * the toU mapping does not have a fromU counterpart:
+             * (reverse) fallback codepage->Unicode, copy it to the fromU table
+             */
+
+            /* ignore reverse fallbacks to Unicode SUB (a single U+FFFD or U+001A) */
+            if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
+                toUMapping->f=3; /* reverse fallback */
+                ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
+
+                /* the table may have been reallocated: recompute the pointer from the index */
+                fromUMapping=fromUTable->mappings+fromUIndex;
+            }
+
+            ++toUMapping;
+            ++toUIndex;
+        }
+    }
+
+    /* either one or both tables are exhausted */
+    while(fromUIndex<fromUTop) {
+        /* leftover fromU mappings are fallbacks; same |2 or |1 decision as in the main loop */
+        if( (fromUMapping->bLen==subcharLength &&
+             0==uprv_memcmp(UCM_GET_BYTES(fromUTable, fromUMapping), subchar, subcharLength)) ||
+            (subchar1!=0 && fromUMapping->bLen==1 && fromUMapping->b.bytes[0]==subchar1)
+        ) {
+            fromUMapping->f=2; /* SUB mapping */
+        } else {
+            fromUMapping->f=1; /* normal fallback */
+        }
+
+        ++fromUMapping;
+        ++fromUIndex;
+    }
+
+    while(toUIndex<toUTop) {
+        /* leftover toU mappings are reverse fallbacks; fromUMapping is not used any more in this loop */
+
+        /* ignore reverse fallbacks to Unicode SUB */
+        if(!(toUMapping->uLen==1 && (toUMapping->u==0xfffd || toUMapping->u==0x1a))) {
+            toUMapping->f=3; /* reverse fallback */
+            ucm_addMapping(fromUTable, toUMapping, UCM_GET_CODE_POINTS(toUTable, toUMapping), UCM_GET_BYTES(toUTable, toUMapping));
+        }
+
+        ++toUMapping;
+        ++toUIndex;
+    }
+
+    fromUTable->isSorted=FALSE; /* flags were changed and mappings may have been appended */
+}
+
+/* separate extension mappings out of base table for rptp2ucm --------------- */
+
+/*
+ * Go through the base table of ucm and flag each mapping:
+ * - in an SI/SO-stateful table (isSISO), single-byte mappings for 0x0e or
+ *   0x0f are removed with a warning
+ * - mappings for which ucm_mappingType() returns >0 are moved to ucm->ext
+ * - mappings with illegal byte sequences (type <0) are printed
+ *
+ * Returns FALSE if there was an illegal byte sequence, or if the base/ext
+ * check after moving mappings fails; otherwise TRUE.
+ * On success, the base table is sorted.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_separateMappings(UCMFile *ucm, UBool isSISO) {
+    UCMTable *base=ucm->base;
+    UCMapping *cur=base->mappings;
+    UCMapping *limit=cur+base->mappingsLength;
+    UBool mustMove=FALSE, allLegal=TRUE;
+    int32_t type;
+
+    while(cur<limit) {
+        if(isSISO && cur->bLen==1 && (cur->b.bytes[0]==0xe || cur->b.bytes[0]==0xf)) {
+            fprintf(stderr, "warning: removing illegal mapping from an SI/SO-stateful table\n");
+            ucm_printMapping(base, cur, stderr);
+            cur->moveFlag|=REMOVE_MAPPING;
+            mustMove=TRUE;
+        } else {
+            type=ucm_mappingType(
+                    &ucm->states, cur,
+                    UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
+            if(type<0) {
+                /* illegal byte sequence: report it and keep checking the rest */
+                printMapping(cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur), stderr);
+                allLegal=FALSE;
+            } else if(type>0) {
+                cur->moveFlag|=MOVE_TO_EXT;
+                mustMove=TRUE;
+            }
+        }
+        ++cur;
+    }
+
+    if(!allLegal) {
+        return FALSE;
+    }
+
+    if(!mustMove) {
+        ucm_sortTable(ucm->base);
+        return TRUE;
+    }
+
+    moveMappings(ucm->base, ucm->ext);
+    return ucm_checkBaseExt(&ucm->states, ucm->base, ucm->ext, ucm->ext, FALSE);
+}
+
+/* ucm parser --------------------------------------------------------------- */
+
+/*
+ * Parse a sequence of \xXX bytes, optionally joined with '+', starting at *ps.
+ *
+ * bytes: receives the parsed bytes, at most UCNV_EXT_MAX_BYTES
+ * line:  the whole input line, only used in error messages
+ * ps:    in: start of parsing; out: first character after the bytes
+ *        (not changed if an error occurs)
+ *
+ * Returns the number of bytes parsed (can be 0),
+ * or -1 after printing an error message to stderr.
+ */
+U_CAPI int8_t U_EXPORT2
+ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
+    const char *p=*ps;
+    char *end;
+    uint8_t value=0;
+    int8_t count=0;
+
+    for(;;) {
+        /* a plus sign may join this byte to the previous one */
+        if(count>0 && *p=='+') {
+            ++p;
+        }
+        if(*p!='\\') {
+            break;
+        }
+
+        /* the hex number must end exactly 4 characters after the backslash */
+        end=NULL;
+        if(p[1]=='x') {
+            value=(uint8_t)uprv_strtoul(p+2, &end, 16);
+        }
+        if(end!=p+4) {
+            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
+            return -1;
+        }
+
+        if(count==UCNV_EXT_MAX_BYTES) {
+            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
+            return -1;
+        }
+        bytes[count++]=value;
+        p=end;
+    }
+
+    *ps=p;
+    return count;
+}
+
+/*
+ * parse a mapping line; must not be empty
+ *
+ * Expected on the line: one or more <UXXXX> code points (optionally joined
+ * with '+'), optional white space, one or more \xXX bytes (optionally
+ * joined with '+'), and optionally a precision flag |0..|3 anywhere later.
+ *
+ * Sets m->uLen, m->bLen and m->f (-1 if there is no flag).
+ * m->u is set only for a single code point, and m->b.bytes only for up to
+ * 4 bytes. codePoints[] and bytes[] receive all parsed values.
+ * Prints a message to stderr and returns FALSE if the line is not well-formed.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseMappingLine(UCMapping *m,
+                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
+                     const char *line) {
+    const char *s;
+    char *end;
+    UChar32 cp;
+    int32_t u16Length;
+    int8_t uLen, bLen, f;
+
+    s=line;
+    uLen=bLen=0;
+
+    /* parse code points */
+    for(;;) {
+        /* skip an optional plus sign */
+        if(uLen>0 && *s=='+') {
+            ++s;
+        }
+        if(*s!='<') {
+            break;
+        }
+
+        if( s[1]!='U' ||
+            (cp=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || /* end==s+2: no digits were parsed */
+            *end!='>'
+        ) {
+            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
+            return FALSE;
+        }
+        if((uint32_t)cp>0x10ffff || U_IS_SURROGATE(cp)) {
+            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
+            return FALSE;
+        }
+
+        if(uLen==UCNV_EXT_MAX_UCHARS) {
+            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
+            return FALSE;
+        }
+        codePoints[uLen++]=cp;
+        s=end+1; /* continue after the '>' */
+    }
+
+    if(uLen==0) {
+        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
+        return FALSE;
+    } else if(uLen==1) {
+        m->u=codePoints[0];
+    } else {
+        UErrorCode errorCode=U_ZERO_ERROR;
+        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); /* no output buffer: only the UTF-16 length is needed */
+        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
+            u16Length>UCNV_EXT_MAX_UCHARS
+        ) {
+            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
+            return FALSE;
+        }
+    }
+
+    s=u_skipWhitespace(s);
+
+    /* parse bytes */
+    bLen=ucm_parseBytes(bytes, line, &s);
+
+    if(bLen<0) {
+        return FALSE; /* ucm_parseBytes() has printed the error */
+    } else if(bLen==0) {
+        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
+        return FALSE;
+    } else if(bLen<=4) {
+        uprv_memcpy(m->b.bytes, bytes, bLen);
+    }
+
+    /* skip everything until the fallback indicator, even the start of a comment */
+    for(;;) {
+        if(*s==0) {
+            f=-1; /* no fallback indicator */
+            break;
+        } else if(*s=='|') {
+            f=(int8_t)(s[1]-'0');
+            if((uint8_t)f>3) { /* the unsigned comparison also rejects characters below '0' */
+                fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
+                return FALSE;
+            }
+            break;
+        }
+        ++s;
+    }
+
+    m->uLen=uLen;
+    m->bLen=bLen;
+    m->f=f;
+    return TRUE;
+}
+
+/* general APIs ------------------------------------------------------------- */
+
+/*
+ * Allocate a new UCMTable with all fields set to 0.
+ * Prints a message and exits the program if the allocation fails.
+ */
+U_CAPI UCMTable * U_EXPORT2
+ucm_openTable() {
+    UCMTable *t=(UCMTable *)uprv_malloc(sizeof(UCMTable));
+
+    if(t!=NULL) {
+        memset(t, 0, sizeof(UCMTable));
+        return t;
+    }
+
+    fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
+    exit(U_MEMORY_ALLOCATION_ERROR);
+    return NULL; /* not reached */
+}
+
+/* Free a UCMTable together with all of its arrays; NULL is ignored. */
+U_CAPI void U_EXPORT2
+ucm_closeTable(UCMTable *table) {
+    if(table==NULL) {
+        return;
+    }
+
+    uprv_free(table->mappings);
+    uprv_free(table->codePoints);
+    uprv_free(table->bytes);
+    uprv_free(table->reverseMap);
+    uprv_free(table);
+}
+
+/*
+ * Empty a table without freeing its arrays: the lengths, flagsType and
+ * unicodeMask are set to 0 and the table is marked as not sorted.
+ * NULL is ignored.
+ */
+U_CAPI void U_EXPORT2
+ucm_resetTable(UCMTable *table) {
+    if(table==NULL) {
+        return;
+    }
+
+    table->mappingsLength=0;
+    table->flagsType=0;
+    table->unicodeMask=0;
+    table->codePointsLength=0;
+    table->bytesLength=0;
+    table->isSorted=FALSE;
+}
+
+U_CAPI void U_EXPORT2 /* appends a copy of *m to table; prints a message and exits the program if memory runs out */
+ucm_addMapping(UCMTable *table,
+               UCMapping *m, /* in/out: m->u (for uLen>1) and m->b.index (for bLen>4) are overwritten with indexes into table */
+               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+    UCMapping *tm;
+    UChar32 c;
+    int32_t index;
+
+    if(table->mappingsLength>=table->mappingsCapacity) {
+        /* make the mappings array larger: 1000 at first, then 10 times the previous capacity */
+        if(table->mappingsCapacity==0) {
+            table->mappingsCapacity=1000;
+        } else {
+            table->mappingsCapacity*=10;
+        }
+        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
+                                             table->mappingsCapacity*sizeof(UCMapping));
+        if(table->mappings==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
+                            (int)table->mappingsCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        if(table->reverseMap!=NULL) {
+            /* the reverseMap must be reallocated in a new sort (ucm_sortTable() sizes it by mappingsCapacity) */
+            uprv_free(table->reverseMap);
+            table->reverseMap=NULL;
+        }
+    }
+
+    if(m->uLen>1 && table->codePointsCapacity==0) { /* allocated on first use, with a fixed capacity that never grows */
+        table->codePointsCapacity=10000;
+        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
+        if(table->codePoints==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
+                            (int)table->codePointsCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    if(m->bLen>4 && table->bytesCapacity==0) { /* allocated on first use, with a fixed capacity that never grows */
+        table->bytesCapacity=10000;
+        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
+        if(table->bytes==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
+                            (int)table->bytesCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    if(m->uLen>1) { /* multiple code points are stored in table->codePoints, and m->u becomes their start index */
+        index=table->codePointsLength;
+        table->codePointsLength+=m->uLen;
+        if(table->codePointsLength>table->codePointsCapacity) {
+            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4);
+        m->u=index;
+    }
+
+    if(m->bLen>4) { /* more than 4 bytes are stored in table->bytes, and m->b.index becomes their start index */
+        index=table->bytesLength;
+        table->bytesLength+=m->bLen;
+        if(table->bytesLength>table->bytesCapacity) {
+            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        uprv_memcpy(table->bytes+index, bytes, m->bLen);
+        m->b.index=index;
+    }
+
+    /* set unicodeMask from the codePoints argument (m->u may hold an index by now) */
+    for(index=0; index<m->uLen; ++index) {
+        c=codePoints[index];
+        if(c>=0x10000) {
+            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
+        } else if(U_IS_SURROGATE(c)) {
+            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
+        }
+    }
+
+    /* set flagsType: a negative f means that the mapping has no precision flag */
+    if(m->f<0) {
+        table->flagsType|=UCM_FLAGS_IMPLICIT;
+    } else {
+        table->flagsType|=UCM_FLAGS_EXPLICIT;
+    }
+
+    tm=table->mappings+table->mappingsLength++; /* append at the end */
+    uprv_memcpy(tm, m, sizeof(UCMapping));
+
+    table->isSorted=FALSE;
+}
+
+/*
+ * Allocate a new UCMFile with empty base and extension tables
+ * and with initial values in its states.
+ * Prints a message and exits the program if the allocation fails.
+ */
+U_CAPI UCMFile * U_EXPORT2
+ucm_open() {
+    UCMFile *file=(UCMFile *)uprv_malloc(sizeof(UCMFile));
+    UCMStates *states;
+
+    if(file==NULL) {
+        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    memset(file, 0, sizeof(UCMFile));
+
+    file->base=ucm_openTable();
+    file->ext=ucm_openTable();
+
+    /* everything in the states that is not set here stays 0 */
+    states=&file->states;
+    states->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
+    states->conversionType=UCNV_UNSUPPORTED_CONVERTER;
+    states->outputType=-1;
+    states->minCharLength=1;
+    states->maxCharLength=1;
+
+    return file;
+}
+
+/*
+ * Free a UCMFile together with its base and extension tables.
+ * NULL is ignored.
+ *
+ * The tables are released with ucm_closeTable() so that their mappings,
+ * codePoints, bytes and reverseMap arrays are freed as well;
+ * a plain uprv_free() of the table structs would leak those arrays.
+ */
+U_CAPI void U_EXPORT2
+ucm_close(UCMFile *ucm) {
+    if(ucm!=NULL) {
+        ucm_closeTable(ucm->base);
+        ucm_closeTable(ucm->ext);
+        uprv_free(ucm);
+    }
+}
+
+/*
+ * Classify a mapping relative to the base table state machine.
+ *
+ * Returns
+ *   -1 if ucm_countChars() finds less than one character in the bytes
+ *      (illegal byte sequence)
+ *    0 if the mapping is suitable for an ICU conversion base table
+ *    1 if it needs to go into an extension table
+ *
+ * Suitable for a base table means:
+ * - a 1:1 mapping
+ * - not a |2 SUB mapping for <subchar1>
+ * - not a |1 fallback to 0x00
+ * - no leading 0x00 bytes
+ *
+ * The codePoints parameter is not used.
+ */
+U_CAPI int32_t U_EXPORT2
+ucm_mappingType(UCMStates *baseStates,
+                UCMapping *m,
+                UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+    int32_t charCount=ucm_countChars(baseStates, bytes, m->bLen);
+
+    if(charCount<1) {
+        return -1;
+    }
+
+    /* not 1:1 */
+    if(m->uLen!=1 || charCount!=1) {
+        return 1;
+    }
+    /* single-byte |2 mapping in a table with multi-byte characters */
+    if(m->f==2 && m->bLen==1 && baseStates->maxCharLength>1) {
+        return 1;
+    }
+    /* |1 fallback to the single byte 0x00 */
+    if(m->f==1 && m->bLen==1 && bytes[0]==0) {
+        return 1;
+    }
+    /* multi-byte sequence with a leading 0x00 */
+    if(m->bLen>1 && bytes[0]==0) {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Add a mapping to the base table of ucm if this is requested (forBase)
+ * and the mapping is suitable for it; otherwise add it to the extension table.
+ *
+ * baseStates may be NULL; then the mapping always goes into the extension table
+ * without checking its bytes.
+ *
+ * Returns FALSE, after printing the mapping to stderr, for a |2 mapping
+ * from multiple code points and for an illegal byte sequence.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates,
+                   UCMapping *m,
+                   UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                   uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+    UCMTable *target;
+    int32_t type;
+
+    if(m->f==2 && m->uLen>1) {
+        fprintf(stderr, "ucm error: illegal <subchar1> |2 mapping from multiple code points\n");
+        printMapping(m, codePoints, bytes, stderr);
+        return FALSE;
+    }
+
+    if(baseStates==NULL) {
+        /* not used - adding a mapping for an extension-only table before its base table is read */
+        type=1;
+    } else {
+        /* check validity of the bytes and count the characters in them */
+        type=ucm_mappingType(baseStates, m, codePoints, bytes);
+        if(type<0) {
+            /* illegal byte sequence */
+            printMapping(m, codePoints, bytes, stderr);
+            return FALSE;
+        }
+    }
+
+    /* base table only if requested and suitable */
+    target= (forBase && type==0) ? ucm->base : ucm->ext;
+    ucm_addMapping(target, m, codePoints, bytes);
+
+    return TRUE;
+}
+
+/*
+ * Parse one line of a mapping table and add its mapping to ucm.
+ * Lines that start with '#' and lines with only white space are ignored
+ * and count as success.
+ * Returns FALSE if the line cannot be parsed or the mapping cannot be added.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
+    UCMapping m={ 0 };
+    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
+    uint8_t bytes[UCNV_EXT_MAX_BYTES];
+    const char *s;
+
+    /* comment line */
+    if(line[0]=='#') {
+        return TRUE;
+    }
+
+    /* empty line */
+    s=u_skipWhitespace(line);
+    if(*s==0 || *s=='\n' || *s=='\r') {
+        return TRUE;
+    }
+
+    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
+        return FALSE;
+    }
+    return ucm_addMappingAuto(ucm, forBase, baseStates, &m, codePoints, bytes);
+}
+
+/*
+ * Read mapping lines from convFile up to the line "END CHARMAP"
+ * and add their mappings to ucm.
+ *
+ * Does nothing if *pErrorCode already indicates a failure.
+ * Sets *pErrorCode to U_INVALID_TABLE_FORMAT if the file ends before
+ * "END CHARMAP" or if any line could not be added; reading continues
+ * after a bad line so that all errors get reported.
+ */
+U_CAPI void U_EXPORT2
+ucm_readTable(UCMFile *ucm, FileStream* convFile,
+              UBool forBase, UCMStates *baseStates,
+              UErrorCode *pErrorCode) {
+    char line[500];
+    char *end;
+    UBool isOK=TRUE;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    for(;;) {
+        if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
+            fprintf(stderr, "incomplete charmap section\n");
+            isOK=FALSE;
+            break;
+        }
+
+        /* cut off trailing CR and LF characters */
+        for(end=uprv_strchr(line, 0); line<end && (end[-1]=='\r' || end[-1]=='\n'); --end) {
+        }
+        *end=0;
+
+        /* ignore empty and comment lines */
+        if(line[0]==0 || line[0]=='#') {
+            continue;
+        }
+
+        /* the mapping table ends here */
+        if(0==uprv_strcmp(line, "END CHARMAP")) {
+            break;
+        }
+
+        if(!ucm_addMappingFromLine(ucm, line, forBase, baseStates)) {
+            isOK=FALSE;
+        }
+    }
+
+    if(!isOK) {
+        *pErrorCode=U_INVALID_TABLE_FORMAT;
+    }
+}