+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
/*
******************************************************************************
*
-* Copyright (C) 2002, International Business Machines
+* Copyright (C) 2002-2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: bocu1tst.c
-* encoding: US-ASCII
+* encoding: UTF-8
* tab size: 8 (not used)
* indentation:4
*
*
* This is the reference implementation of BOCU-1,
* the MIME-friendly form of the Binary Ordered Compression for Unicode,
-* taken directly from ### http://oss.software.ibm.com/cvs/icu/icuhtml/design/conversion/bocu1/
+* taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
* The files bocu1.h and bocu1.c from the design folder are taken
* verbatim (minus copyright and #include) and copied together into this file.
* The reference code and some of the reference bocu1tst.c
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ucnv.h"
+#include "unicode/utf16.h"
#include "cmemory.h"
#include "cintltst.h"
-#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
-
/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
/* BOCU-1 constants and macros ---------------------------------------------- */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
-#define BOCU1_MAX_TRAIL 0xff
+
+/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
+#define BOCU1_MAX_TRAIL 0xffL
#define BOCU1_RESET 0xff
/* number of lead bytes */
* to trail byte values 0..19 (0..0x13) as used in the difference calculation.
* External byte values that are illegal as trail bytes are mapped to -1.
*/
-static int8_t
+static const int8_t
bocu1ByteToTrail[BOCU1_MIN]={
/* 0 1 2 3 4 5 6 7 */
-1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
* from trail byte values 0..19 (0..0x13) as used in the difference calculation
* to external byte values 0x00..0x20.
*/
-static int8_t
+static const int8_t
bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
/* 0 1 2 3 4 5 6 7 */
0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
* @param c current code point, 0..0x10ffff
* @return "previous code point" state value
*/
-static U_INLINE int32_t
+static int32_t
bocu1Prev(int32_t c) {
/* compute new prev */
if(0x3040<=c && c<=0x309f) {
/* CJK Unihan */
return 0x4e00-BOCU1_REACH_NEG_2;
} else if(0xac00<=c && c<=0xd7a3) {
- /* Korean Hangul */
- return (0xd7a3+0xac00)/2;
+ /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
+ return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
} else {
/* mostly small scripts */
return (c&~0x7f)+BOCU1_ASCII_PREV;
} while(--count>0);
/* add lead byte */
- result|=(lead+diff)<<shift;
+ result |= (uint32_t)(lead+diff)<<shift;
return result;
}
p0=p;
i=0;
while(i<length) {
- UTF_NEXT_CHAR(s, i, length, c);
+ U16_NEXT(s, i, length, c);
p+=writePacked(encodeBocu1(&prev, c), p);
}
- return p-p0;
+ return (int32_t)(p-p0);
}
/**
return -1;
}
if(c>=0) {
- UTF_APPEND_CHAR_UNSAFE(s, sLength, c);
+ U16_APPEND_UNSAFE(s, sLength, c);
}
}
return sLength;
}
-static U_INLINE char
+static char /* returns the lowercase hex digit character: '0'..'9' for 0..9, 'a' onward for 10 and above */
hexDigit(uint8_t digit) {
return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
}
/* cintltst code ------------------------------------------------------------ */
+static const int32_t DEFAULT_BUFFER_SIZE = 30000; /* element count used for each heap-allocated test buffer (UChar or char) */
+
+
/* test one string with the ICU and the reference BOCU-1 implementations */
static void
roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
+ /*
+ * Encodes text[0..length) with the reference writeString() and with
+ * ucnv_fromUChars(bocu1), compares the two byte sequences, then decodes
+ * with readString() and ucnv_toUChars() and compares both results with
+ * the original text and with each other.
+ * number only identifies the text in log messages.
+ * Every failure is reported (log_err here, or inside readString())
+ * and ends the check for this text; all paths leave through cleanup.
+ * NOTE(review): writeString() gets no capacity argument, so callers
+ * presumably keep text short enough for DEFAULT_BUFFER_SIZE bytes - confirm.
+ */
- static UChar roundtripRef[30000], roundtripICU[30000];
- static char bocu1Ref[30000], bocu1ICU[30000];
+ UChar *roundtripRef, *roundtripICU;
+ char *bocu1Ref, *bocu1ICU;
int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
UErrorCode errorCode;
+ roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
+ roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
+ bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
+ bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
+
+ /* do not write through a failed allocation; free(NULL) at cleanup is a no-op */
+ if(roundtripRef==NULL || roundtripICU==NULL || bocu1Ref==NULL || bocu1ICU==NULL) {
+ log_err("roundtripBOCU1(text(%d)[%d]): out of memory allocating test buffers\n", number, length);
+ goto cleanup;
+ }
+
/* Unicode -> BOCU-1 */
bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
errorCode=U_ZERO_ERROR;
- bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, sizeof(bocu1ICU), text, length, &errorCode);
+ bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
if(U_FAILURE(errorCode)) {
log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
- return;
+ goto cleanup;
}
if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
- return;
+ goto cleanup;
}
/* BOCU-1 -> Unicode */
roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
if(roundtripRefLength<0) {
- return; /* readString() found an error and reported it */
+ goto cleanup; /* readString() found an error and reported it */
}
- roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, sizeof(roundtripICU)/U_SIZEOF_UCHAR, bocu1ICU, bocu1ICULength, &errorCode);
+ roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
if(U_FAILURE(errorCode)) {
log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
- return;
+ goto cleanup;
}
if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
- return;
+ goto cleanup;
}
if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
- return;
+ goto cleanup;
}
+cleanup:
+ free(roundtripRef);
+ free(roundtripICU);
+ free(bocu1Ref);
+ free(bocu1ICU);
}
static const UChar feff[]={ 0xfeff };
const UChar *s;
int32_t length;
} strings[]={
- { feff, LENGTHOF(feff) },
- { ascii, LENGTHOF(ascii) },
- { crlf, LENGTHOF(crlf) },
- { nul, LENGTHOF(nul) },
- { latin, LENGTHOF(latin) },
- { devanagari, LENGTHOF(devanagari) },
- { hiragana, LENGTHOF(hiragana) },
- { unihan, LENGTHOF(unihan) },
- { hangul, LENGTHOF(hangul) },
- { surrogates, LENGTHOF(surrogates) },
- { plane1, LENGTHOF(plane1) },
- { plane2, LENGTHOF(plane2) },
- { plane15, LENGTHOF(plane15) },
- { plane16, LENGTHOF(plane16) },
- { c0, LENGTHOF(c0) }
+ { feff, UPRV_LENGTHOF(feff) },
+ { ascii, UPRV_LENGTHOF(ascii) },
+ { crlf, UPRV_LENGTHOF(crlf) },
+ { nul, UPRV_LENGTHOF(nul) },
+ { latin, UPRV_LENGTHOF(latin) },
+ { devanagari, UPRV_LENGTHOF(devanagari) },
+ { hiragana, UPRV_LENGTHOF(hiragana) },
+ { unihan, UPRV_LENGTHOF(unihan) },
+ { hangul, UPRV_LENGTHOF(hangul) },
+ { surrogates, UPRV_LENGTHOF(surrogates) },
+ { plane1, UPRV_LENGTHOF(plane1) },
+ { plane2, UPRV_LENGTHOF(plane2) },
+ { plane15, UPRV_LENGTHOF(plane15) },
+ { plane16, UPRV_LENGTHOF(plane16) },
+ { c0, UPRV_LENGTHOF(c0) }
};
/*
*/
static void
TestBOCU1(void) {
+ /*
+ * Opens the ICU "BOCU-1" converter, assembles test texts from strings[]
+ * in a heap buffer of DEFAULT_BUFFER_SIZE UChars and hands them to
+ * roundtripBOCU1(); the converter and the buffer are released before returning.
+ * A converter that cannot be opened is reported with log_data_err.
+ */
- UChar text[30000];
+ UChar *text;
int32_t i, length;
UConverter *bocu1;
errorCode=U_ZERO_ERROR;
bocu1=ucnv_open("BOCU-1", &errorCode);
if(U_FAILURE(errorCode)) {
- log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
+ log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
return;
}
+ text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
+ /* u_memcpy() below must not write through a failed allocation; release the converter first */
+ if(text==NULL) {
+ log_err("error: unable to allocate the BOCU-1 test text buffer (%d UChars)\n", (int)DEFAULT_BUFFER_SIZE);
+ ucnv_close(bocu1);
+ return;
+ }
+
/* text 1: each of strings[] once */
length=0;
- for(i=0; i<LENGTHOF(strings); ++i) {
+ for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
u_memcpy(text+length, strings[i].s, strings[i].length);
length+=strings[i].length;
}
/* text 2: each of strings[] twice */
length=0;
- for(i=0; i<LENGTHOF(strings); ++i) {
+ for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
u_memcpy(text+length, strings[i].s, strings[i].length);
length+=strings[i].length;
u_memcpy(text+length, strings[i].s, strings[i].length);
/* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
length=0;
for(i=1; length<5000; i+=7) {
- if(i>=LENGTHOF(strings)) {
- i-=LENGTHOF(strings);
+ if(i>=UPRV_LENGTHOF(strings)) {
+ i-=UPRV_LENGTHOF(strings);
}
u_memcpy(text+length, strings[i].s, strings[i].length);
length+=strings[i].length;
roundtripBOCU1(bocu1, 3, text, length);
ucnv_close(bocu1);
+ free(text);
}
U_CFUNC void addBOCU1Tests(TestNode** root);