]> git.saurik.com Git - apple/icu.git/blobdiff - icuSources/common/ustrtrns.c
ICU-461.12.tar.gz
[apple/icu.git] / icuSources / common / ustrtrns.c
index 1801d8aeb64afe30bb9bb9358a751708f92730b7..aa18ef5bee24cd73e75f56dfea851e82d8b52ffa 100644 (file)
@@ -1,7 +1,7 @@
 /*
 ******************************************************************************
 *
-*   Copyright (C) 2001-2006, International Business Machines
+*   Copyright (C) 2001-2010, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 #include "ustr_imp.h"
 
 U_CAPI UChar* U_EXPORT2 
-u_strFromUTF32(UChar   *dest,
-               int32_t destCapacity, 
+u_strFromUTF32WithSub(UChar *dest,
+               int32_t destCapacity,
                int32_t *pDestLength,
                const UChar32 *src,
                int32_t srcLength,
-               UErrorCode *pErrorCode)
-{
-    int32_t reqLength = 0;
-    uint32_t ch =0;
-    UChar *pDestLimit =dest+destCapacity;
-    UChar *pDest = dest;
-    const uint32_t *pSrc = (const uint32_t *)src;
+               UChar32 subchar, int32_t *pNumSubstitutions,
+               UErrorCode *pErrorCode) {
+    const UChar32 *srcLimit;
+    UChar32 ch;
+    UChar *destLimit;
+    UChar *pDest;
+    int32_t reqLength;
+    int32_t numSubstitutions;
 
     /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
         return NULL;
     }
-    
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
-     /* Check if the source is null terminated */
-    if(srcLength == -1 ){
-        while(((ch=*pSrc)!=0) && (pDest < pDestLimit)){
-            ++pSrc;
-            if(ch<=0xFFFF){
-                *(pDest++)=(UChar)ch;
-            }else if(ch<=0x10ffff){
-                *(pDest++)=UTF16_LEAD(ch);
-                if(pDest<pDestLimit){
-                    *(pDest++)=UTF16_TRAIL(ch);
-                }else{
-                    reqLength++;
-                    break;
-                }
-            }else{
-                *pErrorCode = U_INVALID_CHAR_FOUND;
-                return NULL;
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
+    pDest = dest;
+    destLimit = dest + destCapacity;
+    reqLength = 0;
+    numSubstitutions = 0;
+
+    if(srcLength < 0) {
+        /* simple loop for conversion of a NUL-terminated BMP string */
+        while((ch=*src) != 0 &&
+              ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
+            ++src;
+            if(pDest < destLimit) {
+                *pDest++ = (UChar)ch;
+            } else {
+                ++reqLength;
             }
         }
-        while((ch=*pSrc++) != 0){
-            reqLength+=UTF_CHAR_LENGTH(ch);
+        srcLimit = src;
+        if(ch != 0) {
+            /* "complicated" case, find the end of the remaining string */
+            while(*++srcLimit != 0) {}
         }
-    }else{
-        const uint32_t* pSrcLimit = ((const uint32_t*)pSrc) + srcLength;
-        while((pSrc < pSrcLimit) && (pDest < pDestLimit)){
-            ch = *pSrc++;
-            if(ch<=0xFFFF){
-                *(pDest++)=(UChar)ch;
-            }else if(ch<=0x10FFFF){
-                *(pDest++)=UTF16_LEAD(ch);
-                if(pDest<pDestLimit){
-                    *(pDest++)=UTF16_TRAIL(ch);
-                }else{
-                    reqLength++;
-                    break;
+    } else {
+        srcLimit = src + srcLength;
+    }
+
+    /* convert with length */
+    while(src < srcLimit) {
+        ch = *src++;
+        do {
+            /* usually "loops" once; twice only for writing subchar */
+            if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
+                if(pDest < destLimit) {
+                    *pDest++ = (UChar)ch;
+                } else {
+                    ++reqLength;
+                }
+                break;
+            } else if(0x10000 <= ch && ch <= 0x10ffff) {
+                if((pDest + 2) <= destLimit) {
+                    *pDest++ = U16_LEAD(ch);
+                    *pDest++ = U16_TRAIL(ch);
+                } else {
+                    reqLength += 2;
                 }
-            }else{
+                break;
+            } else if((ch = subchar) < 0) {
+                /* surrogate code point, or not a Unicode code point at all */
                 *pErrorCode = U_INVALID_CHAR_FOUND;
                 return NULL;
+            } else {
+                ++numSubstitutions;
             }
-        }
-        while(pSrc <pSrcLimit){
-            ch = *pSrc++;
-            reqLength+=UTF_CHAR_LENGTH(ch);
-        }
+        } while(TRUE);
     }
 
     reqLength += (int32_t)(pDest - dest);
-    if(pDestLength){
+    if(pDestLength) {
         *pDestLength = reqLength;
     }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }
 
     /* Terminate the buffer */
-    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 
+    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
     
     return dest;
 }
 
+U_CAPI UChar* U_EXPORT2 
+u_strFromUTF32(UChar *dest,
+               int32_t destCapacity, 
+               int32_t *pDestLength,
+               const UChar32 *src,
+               int32_t srcLength,
+               UErrorCode *pErrorCode) {
+    return u_strFromUTF32WithSub(
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}
 
 U_CAPI UChar32* U_EXPORT2 
-u_strToUTF32(UChar32 *dest, 
-             int32_t  destCapacity,
-             int32_t  *pDestLength,
-             const UChar *src, 
-             int32_t  srcLength,
-             UErrorCode *pErrorCode)
-{
-    const UChar* pSrc = src;
-    const UChar* pSrcLimit;
-    int32_t reqLength=0;
-    uint32_t ch=0;
-    uint32_t *pDest = (uint32_t *)dest;
-    uint32_t *pDestLimit = pDest + destCapacity;
-    UChar ch2=0;
+u_strToUTF32WithSub(UChar32 *dest,
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src,
+             int32_t srcLength,
+             UChar32 subchar, int32_t *pNumSubstitutions,
+             UErrorCode *pErrorCode) {
+    const UChar *srcLimit;
+    UChar32 ch;
+    UChar ch2;
+    UChar32 *destLimit;
+    UChar32 *pDest;
+    int32_t reqLength;
+    int32_t numSubstitutions;
 
     /* args check */
-    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
+    if(U_FAILURE(*pErrorCode)){
         return NULL;
     }
-    
-    
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)){
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
-    if(srcLength==-1) {
-        while((ch=*pSrc)!=0 && pDest!=pDestLimit) {
-            ++pSrc;
-            /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
-            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 
-                ++pSrc;
-                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = 0;
+    }
+
+    pDest = dest;
+    destLimit = dest + destCapacity;
+    reqLength = 0;
+    numSubstitutions = 0;
+
+    if(srcLength < 0) {
+        /* simple loop for conversion of a NUL-terminated BMP string */
+        while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
+            ++src;
+            if(pDest < destLimit) {
+                *pDest++ = ch;
+            } else {
+                ++reqLength;
             }
-            *(pDest++)= ch;
         }
-        while((ch=*pSrc++)!=0) {
-            if(UTF_IS_LEAD(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
-                ++pSrc;
-            }
-            ++reqLength; 
+        srcLimit = src;
+        if(ch != 0) {
+            /* "complicated" case, find the end of the remaining string */
+            while(*++srcLimit != 0) {}
         }
     } else {
-        pSrcLimit = pSrc+srcLength;
-        while(pSrc<pSrcLimit && pDest<pDestLimit) {
-            ch=*pSrc++;
-            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
-                ++pSrc;
-                ch=UTF16_GET_PAIR_VALUE(ch, ch2);
-            }
-            *(pDest++)= ch;
+        srcLimit = src + srcLength;
+    }
+
+    /* convert with length */
+    while(src < srcLimit) {
+        ch = *src++;
+        if(!U16_IS_SURROGATE(ch)) {
+            /* write or count ch below */
+        } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
+            ++src;
+            ch = U16_GET_SUPPLEMENTARY(ch, ch2);
+        } else if((ch = subchar) < 0) {
+            /* unpaired surrogate */
+            *pErrorCode = U_INVALID_CHAR_FOUND;
+            return NULL;
+        } else {
+            ++numSubstitutions;
         }
-        while(pSrc!=pSrcLimit) {
-            ch=*pSrc++;
-            if(UTF_IS_LEAD(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
-                ++pSrc;
-            }
+        if(pDest < destLimit) {
+            *pDest++ = ch;
+        } else {
             ++reqLength;
         }
     }
 
-    reqLength+=(int32_t)(pDest - (uint32_t *)dest);
-    if(pDestLength){
+    reqLength += (int32_t)(pDest - dest);
+    if(pDestLength) {
         *pDestLength = reqLength;
     }
+    if(pNumSubstitutions != NULL) {
+        *pNumSubstitutions = numSubstitutions;
+    }
 
     /* Terminate the buffer */
-    u_terminateUChar32s(dest,destCapacity,reqLength,pErrorCode);
+    u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
 
     return dest;
 }
 
+U_CAPI UChar32* U_EXPORT2 
+u_strToUTF32(UChar32 *dest, 
+             int32_t destCapacity,
+             int32_t *pDestLength,
+             const UChar *src, 
+             int32_t srcLength,
+             UErrorCode *pErrorCode) {
+    return u_strToUTF32WithSub(
+            dest, destCapacity, pDestLength,
+            src, srcLength,
+            U_SENTINEL, NULL,
+            pErrorCode);
+}
+
 /* for utf8_nextCharSafeBodyTerminated() */
 static const UChar32
 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
@@ -334,7 +398,6 @@ u_strFromUTF8WithSub(UChar *dest,
               int32_t srcLength,
               UChar32 subchar, int32_t *pNumSubstitutions,
               UErrorCode *pErrorCode){
-
     UChar *pDest = dest;
     UChar *pDestLimit = dest+destCapacity;
     UChar32 ch;
@@ -348,13 +411,17 @@ u_strFromUTF8WithSub(UChar *dest,
         return NULL;
     }
         
-    if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     ) {
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
     numSubstitutions=0;
 
     /*
@@ -534,12 +601,7 @@ u_strFromUTF8WithSub(UChar *dest,
                         *(pDest++)=(UChar)ch;
                     }else{
                         *(pDest++)=UTF16_LEAD(ch);
-                        if(pDest<pDestLimit){
-                            *(pDest++)=UTF16_TRAIL(ch);
-                        }else{
-                            reqLength++;
-                            break;
-                        }
+                        *(pDest++)=UTF16_TRAIL(ch);
                     }
                 }
             } while(--count > 0);
@@ -594,7 +656,7 @@ u_strFromUTF8WithSub(UChar *dest,
                 }
             }
         }
-        /* donot fill the dest buffer just count the UChars needed */
+        /* do not fill the dest buffer just count the UChars needed */
         while(pSrc < pSrcLimit){
             ch = *pSrc;
             if(ch <= 0x7f){
@@ -673,7 +735,6 @@ u_strFromUTF8Lenient(UChar *dest,
                      const char *src,
                      int32_t srcLength,
                      UErrorCode *pErrorCode) {
-
     UChar *pDest = dest;
     UChar32 ch;
     int32_t reqLength = 0;
@@ -684,7 +745,9 @@ u_strFromUTF8Lenient(UChar *dest,
         return NULL;
     }
         
-    if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0)
+    ) {
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
@@ -839,7 +902,7 @@ u_strFromUTF8Lenient(UChar *dest,
             } else if(ch < 0xe0) { /* U+0080..U+07FF */
                 if(pSrc < pSrcLimit) {
                     /* 0x3080 = (0xc0 << 6) + 0x80 */
-                    *pDest++ = (UChar)(((ch & 0x1f) << 6) + *pSrc++ - 0x3080);
+                    *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
                     continue;
                 }
             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
@@ -912,7 +975,6 @@ u_strToUTF8WithSub(char *dest,
             int32_t srcLength,
             UChar32 subchar, int32_t *pNumSubstitutions,
             UErrorCode *pErrorCode){
-
     int32_t reqLength=0;
     uint32_t ch=0,ch2=0;
     uint8_t *pDest = (uint8_t *)dest;
@@ -924,13 +986,17 @@ u_strToUTF8WithSub(char *dest,
         return NULL;
     }
         
-    if( (pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
+    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
+        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     ) {
         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
         return NULL;
     }
 
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
     numSubstitutions=0;
 
     if(srcLength==-1) {
@@ -938,7 +1004,7 @@ u_strToUTF8WithSub(char *dest,
             ++pSrc;
             if(ch <= 0x7f) {
                 if(pDest<pDestLimit) {
-                    *pDest++ = (char)ch;
+                    *pDest++ = (uint8_t)ch;
                 } else {
                     reqLength = 1;
                     break;
@@ -1032,7 +1098,7 @@ u_strToUTF8WithSub(char *dest,
             do {
                 ch=*pSrc++;
                 if(ch <= 0x7f) {
-                    *pDest++ = (char)ch;
+                    *pDest++ = (uint8_t)ch;
                 } else if(ch <= 0x7ff) {
                     *pDest++=(uint8_t)((ch>>6)|0xc0);
                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
@@ -1081,7 +1147,7 @@ u_strToUTF8WithSub(char *dest,
             ch=*pSrc++;
             if(ch <= 0x7f) {
                 if(pDest<pDestLimit) {
-                    *pDest++ = (char)ch;
+                    *pDest++ = (uint8_t)ch;
                 } else {
                     reqLength = 1;
                     break;
@@ -1161,9 +1227,8 @@ u_strToUTF8WithSub(char *dest,
     }
 
     /* Terminate the buffer */
-    u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode);
-
-    return (char*)dest;
+    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
+    return dest;
 }
 
 U_CAPI char* U_EXPORT2 
@@ -1179,3 +1244,397 @@ u_strToUTF8(char *dest,
             U_SENTINEL, NULL,
             pErrorCode);
 }
+
+U_CAPI UChar* U_EXPORT2
+u_strFromJavaModifiedUTF8WithSub(
+        UChar *dest,
+        int32_t destCapacity,
+        int32_t *pDestLength,
+        const char *src,
+        int32_t srcLength,
+        UChar32 subchar, int32_t *pNumSubstitutions,
+        UErrorCode *pErrorCode) {
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    UChar32 ch;
+    int32_t reqLength = 0;
+    const uint8_t* pSrc = (const uint8_t*) src;
+    const uint8_t *pSrcLimit;
+    int32_t count;
+    uint8_t t1, t2; /* trail bytes */
+    int32_t numSubstitutions;
+
+    /* args check */
+    if(U_FAILURE(*pErrorCode)){
+        return NULL;
+    }
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (dest==NULL && destCapacity!=0) || destCapacity<0 ||
+        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=0;
+    }
+    numSubstitutions=0;
+
+    if(srcLength < 0) {
+        /*
+         * Transform a NUL-terminated ASCII string.
+         * Handle non-ASCII strings with slower code.
+         */
+        while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
+            *pDest++=(UChar)ch;
+            ++pSrc;
+        }
+        if(ch == 0) {
+            reqLength=(int32_t)(pDest - dest);
+            if(pDestLength) {
+                *pDestLength = reqLength;
+            }
+
+            /* Terminate the buffer */
+            u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
+            return dest;
+        }
+        srcLength = uprv_strlen((const char *)pSrc);
+    }
+
+    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+    pSrcLimit = pSrc + srcLength;
+    for(;;) {
+        count = (int32_t)(pDestLimit - pDest);
+        srcLength = (int32_t)(pSrcLimit - pSrc);
+        if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
+            /* fast ASCII loop */
+            const uint8_t *prevSrc = pSrc;
+            int32_t delta;
+            while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
+                *pDest++=(UChar)ch;
+                ++pSrc;
+            }
+            delta = (int32_t)(pSrc - prevSrc);
+            count -= delta;
+            srcLength -= delta;
+        }
+        /*
+         * Each iteration of the inner loop progresses by at most 3 UTF-8
+         * bytes and one UChar.
+         */
+        srcLength /= 3;
+        if(count > srcLength) {
+            count = srcLength; /* min(remaining dest, remaining src/3) */
+        }
+        if(count < 3) {
+            /*
+             * Too much overhead if we get near the end of the string,
+             * continue with the next loop.
+             */
+            break;
+        }
+        do {
+            ch = *pSrc;
+            if(ch <= 0x7f){
+                *pDest++=(UChar)ch;
+                ++pSrc;
+            } else {
+                if(ch >= 0xe0) {
+                    if( /* handle U+0000..U+FFFF inline */
+                        ch <= 0xef &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
+                        (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                    ) {
+                        /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                        *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
+                        pSrc += 3;
+                        continue;
+                    }
+                } else {
+                    if( /* handle U+0000..U+07FF inline */
+                        ch >= 0xc0 &&
+                        (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                    ) {
+                        *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
+                        pSrc += 2;
+                        continue;
+                    }
+                }
+
+                if(subchar < 0) {
+                    *pErrorCode = U_INVALID_CHAR_FOUND;
+                    return NULL;
+                } else if(subchar > 0xffff && --count == 0) {
+                    /*
+                     * We need to write two UChars, adjusted count for that,
+                     * and ran out of space.
+                     */
+                    break;
+                } else {
+                    /* function call for error cases */
+                    ++pSrc; /* continue after the lead byte */
+                    utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                    ++numSubstitutions;
+                    if(subchar<=0xFFFF) {
+                        *(pDest++)=(UChar)subchar;
+                    } else {
+                        *(pDest++)=U16_LEAD(subchar);
+                        *(pDest++)=U16_TRAIL(subchar);
+                    }
+                }
+            }
+        } while(--count > 0);
+    }
+
+    while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
+        ch = *pSrc;
+        if(ch <= 0x7f){
+            *pDest++=(UChar)ch;
+            ++pSrc;
+        } else {
+            if(ch >= 0xe0) {
+                if( /* handle U+0000..U+FFFF inline */
+                    ch <= 0xef &&
+                    ((pSrcLimit - pSrc) >= 3) &&
+                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
+                    (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
+                ) {
+                    /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
+                    *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
+                    pSrc += 3;
+                    continue;
+                }
+            } else {
+                if( /* handle U+0000..U+07FF inline */
+                    ch >= 0xc0 &&
+                    ((pSrcLimit - pSrc) >= 2) &&
+                    (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
+                ) {
+                    *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
+                    pSrc += 2;
+                    continue;
+                }
+            }
+
+            if(subchar < 0) {
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            } else {
+                /* function call for error cases */
+                ++pSrc; /* continue after the lead byte */
+                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                ++numSubstitutions;
+                if(subchar<=0xFFFF) {
+                    *(pDest++)=(UChar)subchar;
+                } else {
+                    *(pDest++)=U16_LEAD(subchar);
+                    if(pDest<pDestLimit) {
+                        *(pDest++)=U16_TRAIL(subchar);
+                    } else {
+                        reqLength++;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    /* do not fill the dest buffer just count the UChars needed */
+    while(pSrc < pSrcLimit){
+        ch = *pSrc;
+        if(ch <= 0x7f) {
+            reqLength++;
+            ++pSrc;
+        } else {
+            if(ch >= 0xe0) {
+                if( /* handle U+0000..U+FFFF inline */
+                    ch <= 0xef &&
+                    ((pSrcLimit - pSrc) >= 3) &&
+                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
+                    (uint8_t)(pSrc[2] - 0x80) <= 0x3f
+                ) {
+                    reqLength++;
+                    pSrc += 3;
+                    continue;
+                }
+            } else {
+                if( /* handle U+0000..U+07FF inline */
+                    ch >= 0xc0 &&
+                    ((pSrcLimit - pSrc) >= 2) &&
+                    (uint8_t)(pSrc[1] - 0x80) <= 0x3f
+                ) {
+                    reqLength++;
+                    pSrc += 2;
+                    continue;
+                }
+            }
+
+            if(subchar < 0) {
+                *pErrorCode = U_INVALID_CHAR_FOUND;
+                return NULL;
+            } else {
+                /* function call for error cases */
+                ++pSrc; /* continue after the lead byte */
+                utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
+                ++numSubstitutions;
+                reqLength+=U16_LENGTH(ch);
+            }
+        }
+    }
+
+    if(pNumSubstitutions!=NULL) {
+        *pNumSubstitutions=numSubstitutions;
+    }
+
+    reqLength+=(int32_t)(pDest - dest);
+    if(pDestLength) {
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
+    return dest;
+}
+
+U_CAPI char* U_EXPORT2 
+u_strToJavaModifiedUTF8(
+        char *dest,
+        int32_t destCapacity,
+        int32_t *pDestLength,
+        const UChar *src, 
+        int32_t srcLength,
+        UErrorCode *pErrorCode) {
+    int32_t reqLength=0;
+    uint32_t ch=0;
+    uint8_t *pDest = (uint8_t *)dest;
+    uint8_t *pDestLimit = pDest + destCapacity;
+    const UChar *pSrcLimit;
+    int32_t count;
+
+    /* args check */
+    if(U_FAILURE(*pErrorCode)){
+        return NULL;
+    }
+    if( (src==NULL && srcLength!=0) || srcLength < -1 ||
+        (dest==NULL && destCapacity!=0) || destCapacity<0
+    ) {
+        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+
+    if(srcLength==-1) {
+        /* Convert NUL-terminated ASCII, then find the string length. */
+        while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
+            *pDest++ = (uint8_t)ch;
+            ++src;
+        }
+        if(ch == 0) {
+            reqLength=(int32_t)(pDest - (uint8_t *)dest);
+            if(pDestLength) {
+                *pDestLength = reqLength;
+            }
+
+            /* Terminate the buffer */
+            u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
+            return dest;
+        }
+        srcLength = u_strlen(src);
+    }
+
+    /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
+    pSrcLimit = src+srcLength;
+    for(;;) {
+        count = (int32_t)(pDestLimit - pDest);
+        srcLength = (int32_t)(pSrcLimit - src);
+        if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
+            /* fast ASCII loop */
+            const UChar *prevSrc = src;
+            int32_t delta;
+            while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
+                *pDest++=(uint8_t)ch;
+                ++src;
+            }
+            delta = (int32_t)(src - prevSrc);
+            count -= delta;
+            srcLength -= delta;
+        }
+        /*
+         * Each iteration of the inner loop progresses by at most 3 UTF-8
+         * bytes and one UChar.
+         */
+        count /= 3;
+        if(count > srcLength) {
+            count = srcLength; /* min(remaining dest/3, remaining src) */
+        }
+        if(count < 3) {
+            /*
+             * Too much overhead if we get near the end of the string,
+             * continue with the next loop.
+             */
+            break;
+        }
+        do {
+            ch=*src++;
+            if(ch <= 0x7f && ch != 0) {
+                *pDest++ = (uint8_t)ch;
+            } else if(ch <= 0x7ff) {
+                *pDest++=(uint8_t)((ch>>6)|0xc0);
+                *pDest++=(uint8_t)((ch&0x3f)|0x80);
+            } else {
+                *pDest++=(uint8_t)((ch>>12)|0xe0);
+                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                *pDest++=(uint8_t)((ch&0x3f)|0x80);
+            }
+        } while(--count > 0);
+    }
+
+    while(src<pSrcLimit) {
+        ch=*src++;
+        if(ch <= 0x7f && ch != 0) {
+            if(pDest<pDestLimit) {
+                *pDest++ = (uint8_t)ch;
+            } else {
+                reqLength = 1;
+                break;
+            }
+        } else if(ch <= 0x7ff) {
+            if((pDestLimit - pDest) >= 2) {
+                *pDest++=(uint8_t)((ch>>6)|0xc0);
+                *pDest++=(uint8_t)((ch&0x3f)|0x80);
+            } else {
+                reqLength = 2;
+                break;
+            }
+        } else {
+            if((pDestLimit - pDest) >= 3) {
+                *pDest++=(uint8_t)((ch>>12)|0xe0);
+                *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
+                *pDest++=(uint8_t)((ch&0x3f)|0x80);
+            } else {
+                reqLength = 3;
+                break;
+            }
+        }
+    }
+    while(src<pSrcLimit) {
+        ch=*src++;
+        if(ch <= 0x7f && ch != 0) {
+            ++reqLength;
+        } else if(ch<=0x7ff) {
+            reqLength+=2;
+        } else {
+            reqLength+=3;
+        }
+    }
+
+    reqLength+=(int32_t)(pDest - (uint8_t *)dest);
+    if(pDestLength){
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
+    return dest;
+}