X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/46f4442e9a5a4f3b98b7c1083586332f6a8a99a4..efa1e6592fb03ce23b15276b2b91d885a3ee7da5:/icuSources/data/unidata/changes.txt diff --git a/icuSources/data/unidata/changes.txt b/icuSources/data/unidata/changes.txt index 2d1dfcc3..5e88414d 100644 --- a/icuSources/data/unidata/changes.txt +++ b/icuSources/data/unidata/changes.txt @@ -1,4 +1,4 @@ -* Copyright (C) 2004-2008, International Business Machines +* Copyright (C) 2004-2012, International Business Machines * Corporation and others. All Rights Reserved. * * file name: changes.txt @@ -13,6 +13,1042 @@ ---------------------------------------------------------------------------- *** +Unicode 6.2 update + +http://www.unicode.org/review/pri230/ +http://www.unicode.org/versions/beta-6.2.0.html +http://www.unicode.org/reports/tr44/tr44-9.html#Unicode_6.2.0 +http://www.unicode.org/review/pri227/ Changes to Script Extensions Property Values +http://www.unicode.org/review/pri228/ Changing some common characters from Punctuation to Symbol +http://www.unicode.org/review/pri229/ Linebreaking Changes for Pictographic Symbols +http://www.unicode.org/reports/tr46/tr46-8.html IDNA +http://unicode.org/Public/idna/6.2.0/ + +*** ICU Trac + +- ticket 9515: Unicode 6.2: final ICU update + +- ticket 9514: UCA 6.2: fix UCARules.txt + +- ticket 9437: update ICU to Unicode 6.2 +- C++ branches/markus/uni62 at r32050 from trunk at r32041 +- Java branches/markus/uni62 at r32068 from trunk at r32066 + +*** Unicode version numbers +- makedata.mak +- uchar.h + (configure.in & configure: have been modified to extract the version from uchar.h) +- com.ibm.icu.util.VersionInfo +- com.ibm.icu.dev.test.lang.UCharacterTest.VERSION_ + +*** data files & enums & parser code + +* file preparation + +- download UCD, UCA & IDNA files +- make sure that the Unicode data folder passed into preparseucd.py + includes a copy of the latest IdnaMappingTable.txt (can be in some subfolder) +- modify 
preparseucd.py: NamesList.txt is now in UTF-8 +- ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni62/20120816 ~/svn.icu/uni62/src ~/svn.icu/tools/trunk/src +- This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. +- Check test file diffs for previously commented-out, known-failing data lines; + probably need to keep those commented out. + +* PropertyValueAliases.txt changes +- 1 new Line_Break (lb) value: + lb ; RI ; Regional_Indicator + -> uchar.h & UCharacter.LineBreak +- 1 new Word_Break (WB) value: + WB ; RI ; Regional_Indicator + -> uchar.h & UCharacter.WordBreak +- 1 new Grapheme_Cluster_Break (GCB) value: + GCB; RI ; Regional_Indicator + -> uchar.h & UCharacter.GraphemeClusterBreak + +* 3 new numeric values + The new value -1, which was really supposed to be NaN but that would have required + new UnicodeData.txt syntax, can already be represented as a "fraction" of -1/1, + but encodeNumericValue() in corepropsbuilder.cpp had to be fixed. + cp;12456;na=CUNEIFORM NUMERIC SIGN NIGIDAMIN;nv=-1 + cp;12457;na=CUNEIFORM NUMERIC SIGN NIGIDAESH;nv=-1 + The two new values 216000 and 432000 require an addition to the encoding of numeric values. 
+ cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000 + cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000 + -> uprops.h, uchar.c & UCharacterProperty.java + -> cucdtst.c & UCharacterTest.java + +* generate normalization data files +- ~/svn.icu/uni62/dbg$ export LD_LIBRARY_PATH=~/svn.icu/uni62/dbg/lib +- ~/svn.icu/uni62/dbg$ SRC_DATA_IN=~/svn.icu/uni62/src/source/data/in +- ~/svn.icu/uni62/dbg$ UNIDATA=~/svn.icu/uni62/src/source/data/unidata +- ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt +- ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt +- ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt +- ~/svn.icu/uni62/dbg$ bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt + +* build ICU (make install) + so that the tools build can pick up the new definitions from the installed header files. +* build Unicode tools using CMake+make + +* generate core properties data files +- ~/svn.icu/tools/trunk/dbg/unicode$ c/genprops/genprops ~/svn.icu/uni62/src +- in initial bootstrapping, change the UCA version + in source/data/unidata/FractionalUCA.txt to match the new Unicode version +- ~/svn.icu/tools/trunk/dbg/unicode$ c/genuca/genuca -i ~/svn.icu/uni62/dbg/data/out/build/icudt50l ~/svn.icu/uni62/src +- rebuild ICU (make install) & tools + + if genrb fails to build coll/root.res with an U_INVALID_FORMAT_ERROR, + check if the UCA version in FractionalUCA.txt matches the new Unicode version + (see step above) +- run genuca again (see step above) so that it picks up the new case mappings and nfc.nrm +- rebuild ICU (make install) & tools + +* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to + sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) +- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" 
on non-ASCII characters +- Unicode 6.0..6.2: U+2260, U+226E, U+226F +- nothing new in 6.2, no test file to update + +* update Java data files +- refresh just the UCD-related files, just to be safe +- see (ICU4C)/source/data/icu4j-readme.txt +- mkdir /tmp/icu4j +- ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + output: + ... + Unicode .icu files built to ./out/build/icudt50l + mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt50b + mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt50b + echo pnames.icu ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt + LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt50l.dat ./out/icu4j/icudt50b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt50l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt50b + mv ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt50b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt50b" + jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt50b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data + jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt50b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data + make[1]: Leaving directory `/home/mscherer/svn.icu/uni62/dbg/data' +- copy the big-endian Unicode data files to another location, + separate from the other data files + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/brkitr + ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt50b + ~/svn.icu/uni62/dbg/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/cnvalias.icu + 
~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt50b + ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll + ~/svn.icu/uni62/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/brkitr +- refresh ICU4J + ~/svn.icu/uni62/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt50b + +* refresh Java test .txt files +- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode + +* UCA + +- get output from Mark's tools; look in http://www.unicode.org/Public/UCA/6.2.0/ +- CLDR root files for ICU are in CollationAuxiliary.zip; unpack that +- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt +- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt + (note removing the underscore before "Rules") +- update (ICU4C)/source/test/testdata/CollationTest_*.txt + and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt + with output from Mark's Unicode tools (..._CLDR_..._SHORT.txt) +- check test file diffs for previously commented-out, known-failing data lines; + probably need to keep those commented out +- check FractionalUCA.txt for manual changes of lead bytes from IMPLICIT to Hani +- run genuca, see command line above +- rebuild ICU4C +- refresh ICU4J collation data: + (subset of instructions above for properties data refresh, except copies all coll/*) + ~/svn.icu/uni62/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + ~/svn.icu/uni62/bld$ mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll + ~/svn.icu/uni62/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt50b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt50b/coll + ~/svn.icu/uni62/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j
com/ibm/icu/impl/data/icudt50b +- run all tests with the *_SHORT.txt or the full files (the full ones have comments, useful for debugging) +- note on intltest: if collate/UCAConformanceTest fails, then + utility/MultithreadTest/TestCollators will fail as well; + fix the conformance test before looking into the multi-thread test + +* test ICU, fix test code where necessary + +* When refreshing all of ICU4J data from ICU4C +- ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install +- cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data +or +- ~/svn.icu/uni62/dbg$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install + +*** LayoutEngine script information +- skipped for Unicode 6.2: no new scripts + +*** merge the Unicode update branches back onto the trunk +- do not merge the icudata.jar and testdata.jar, + instead rebuild them from merged & tested ICU4C + +---------------------------------------------------------------------------- *** + +Future Unicode update + +Tools simplified since the Unicode 6.1 update. See +- http://site.icu-project.org/design/props/ppucd +- http://bugs.icu-project.org/trac/wiki/Markus/ReviewTicket8972 + +* Unicode version numbers +- icutools/unicode/makedefs.sh was deleted, so one fewer place for version & path updates + +* file preparation +- ucdcopy.py, idna2nrm.py and genpname/preparse.pl replaced by preparseucd.py: +- ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20120118 ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src +- This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. +- Check test file diffs for previously commented-out, known-failing data lines; + probably need to keep those commented out. + +* PropertyValueAliases.txt changes +- Script codes that are in ISO 15924 but not in Unicode are now listed in + preparseucd.py, in the _scripts_only_in_iso15924 variable. + If there are new ISO codes, then add them. 
+ If Unicode adds some of them, then remove them from the .py variable. + +* UnicodeData.txt changes +- No more manual changes for CJK ranges for algorithmic names; + those are now written to ppucd.txt and genprops reads them from there. + +* generate core properties data files (makeprops.sh was deleted) +- ~/svn.icu/tools/trunk/dbg/unicode$ c/genprops/genprops ~/svn.icu/trunk/src + +* no more manual updates of source/data/unidata/norm2/nfkc_cf.txt +- it is now generated by preparseucd.py + +* no more separate idna2nrm.py run and manual copying to generate source/data/unidata/norm2/uts46.txt +- it is now generated by preparseucd.py +- make sure that the Unicode data folder passed into preparseucd.py + includes a copy of http://www.unicode.org/Public/idna/6.1.0/IdnaMappingTable.txt + (can be in some subfolder) + +* generate normalization data files +- ~/svn.icu/trunk/dbg$ export LD_LIBRARY_PATH=~/svn.icu/trunk/dbg/lib +- ~/svn.icu/trunk/dbg$ SRC_DATA_IN=~/svn.icu/trunk/src/source/data/in +- ~/svn.icu/trunk/dbg$ UNIDATA=~/svn.icu/trunk/src/source/data/unidata +- ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfc.nrm -s $UNIDATA/norm2 nfc.txt +- ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt +- ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/nfkc_cf.nrm -s $UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt +- ~/svn.icu/trunk/dbg$ bin/gennorm2 -o $SRC_DATA_IN/uts46.nrm -s $UNIDATA/norm2 nfc.txt uts46.txt + +* build ICU (make install) +* build Unicode tools using CMake+make + +* new way to call genuca (makeuca.sh was deleted) +- ~/svn.icu/tools/trunk/dbg/unicode$ c/genuca/genuca -i ~/svn.icu/trunk/dbg/data/out/build/icudt49l ~/svn.icu/trunk/src + +---------------------------------------------------------------------------- *** + +Unicode 6.1 update + +*** ICU Trac + +- ticket 8995 final update to Unicode 6.1 +- ticket 8994 regenerate source/layout/CanonData.cpp + +- ticket 8961 support Unicode "Age" value *names* +- ticket 
8963 support multiple character name aliases & types + +- ticket 8827 "update ICU to Unicode 6.1" +- C++ branches/markus/uni61 at r30864 from trunk at r30843 +- Java branches/markus/uni61 at r30865 from trunk at r30863 + +*** Unicode version numbers +- makedata.mak +- uchar.h + (configure.in & configure: have been modified to extract the version from uchar.h) +- com.ibm.icu.util.VersionInfo +- icutools/unicode/makedefs.sh + + also review & update other definitions in that file, + e.g. the ICU version in this path: BLD_DATA_FILES=$ICU_BLD/data/out/build/icudt49l + +*** data files & enums & parser code + +* file preparation + +~/svn.icu/tools/trunk/src/unicode/c/genprops/misc$ ./ucdcopy.py ~/uni61/20111205/ucd ~/uni61/processed +- This prepares both unidata and testdata files in respective output subfolders. +- Check test file diffs for previously commented-out, known-failing data lines; + probably need to keep those commented out. + +* PropertyValueAliases.txt changes +- 11 new block names: + Arabic_Extended_A + Arabic_Mathematical_Alphabetic_Symbols + Chakma + Meetei_Mayek_Extensions + Meroitic_Cursive + Meroitic_Hieroglyphs + Miao + Sharada + Sora_Sompeng + Sundanese_Supplement + Takri + -> add to uchar.h + -> add to UCharacter.UnicodeBlock IDs + Eclipse find UBLOCK_([^ ]+) = ([0-9]+), (/.+) + replace public static final int \1_ID = \2; \3 + -> add to UCharacter.UnicodeBlock objects + Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+) + replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 +- 1 new Joining_Group (jg) value: + Rohingya_Yeh + -> uchar.h & UCharacter.JoiningGroup +- 2 new Line_Break (lb) values: + CJ=Conditional_Japanese_Starter + HL=Hebrew_Letter + -> uchar.h & UCharacter.LineBreak +- 7 new scripts: + sc ; Cakm ; Chakma + sc ; Merc ; Meroitic_Cursive + sc ; Mero ; Meroitic_Hieroglyphs + sc ; Plrd ; Miao + sc ; Shrd ; Sharada + sc ; Sora ; Sora_Sompeng + sc ; Takr ; Takri + -> remove these from SyntheticPropertyValueAliases.txt + 
-> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java +- 2 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html + (added 2011-06-21) + Khoj 322 Khojki + Tirh 326 Tirhuta + and another one added 2011-12-09 + Hluw 080 Anatolian Hieroglyphs (Luwian Hieroglyphs, Hittite Hieroglyphs) + -> uscript.h + -> com.ibm.icu.lang.UScript + find USCRIPT_([^ ]+) *= ([0-9]+),(.+) + replace public static final int \1 = \2;\3 + -> SyntheticPropertyValueAliases.txt + -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java + +* UnicodeData.txt changes +- the last Unihan code point changes from U+9FCB to U+9FCC + search for both 9FCB (end) and 9FCC (limit) (regex 9FC[BC], case-insensitive) + + do change gennames.c + + do change swapCJK() in ucol.cpp & ImplicitCEGenerator.java + +* DerivedBidiClass.txt changes +- 2 new default-AL blocks: +# Arabic Extended-A: U+08A0 - U+08FF (was default-R) +# Arabic Mathematical Alphabetic Symbols: +# U+1EE00 - U+1EEFF (was default-R) +- 2 new default-R blocks: +# Meroitic Hieroglyphs: +# U+10980 - U+1099F +# Meroitic Cursive: U+109A0 - U+109FF + -> should be picked up by the explicit data in the file + +* NameAliases.txt changes +- from + # Each line has two fields + # First field: Code point + # Second field: Alias +- to + # Each line has three fields, as described here: + # + # First field: Code point + # Second field: Alias + # Third field: Type +- Also, the file previously allowed multiple aliases but only now does it + actually provide multiple, even multiple of the same type. For example, + FEFF;BYTE ORDER MARK;alternate + FEFF;BOM;abbreviation + FEFF;ZWNBSP;abbreviation +- This breaks our gennames parser, unames.icu data structure, and API. + Fix gennames to only pick up "correction" aliases. + New ticket #8963 for further changes. 
+ +* run genpname/preparse.pl (on Linux) + + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname + + make sure that data.h is writable + + perl preparse.pl ~/svn.icu/trunk/src > out.txt + + preparse.pl shows no errors, out.txt Info and Warning lines look ok + +* build ICU (make install) + so that the tools build can pick up the new definitions from the installed header files. +* build Unicode tools (at least genpname) using CMake+make + +* run genpname + (builds both pnames.icu and propname_data.h) +- ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in +- ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource + +* build ICU (make install) +* build Unicode tools using CMake+make + +* update source/data/unidata/norm2/nfkc_cf.txt +- follow the instructions in nfkc_cf.txt for updating it from DerivedNormalizationProps.txt + +* update source/data/unidata/norm2/uts46.txt +- download http://www.unicode.org/Public/idna/6.1.0/IdnaMappingTable.txt + to ~/svn.icu/tools/trunk/src/unicode/py +- adjust idna2nrm.py to remove "; NV8": For UTS #46, we do not care about "not valid in IDNA2008". 
+- ~/svn.icu/tools/trunk/src/unicode/py$ ./idna2nrm.py +- ~/svn.icu/tools/trunk/src/unicode/py$ cp uts46.txt ~/svn.icu/trunk/src/source/data/unidata/norm2 + +* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to + sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) +- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters +- Unicode 6.0..6.1: U+2260, U+226E, U+226F +- nothing new in 6.1, no test file to update + +* generate core properties data files +- in initial bootstrapping, change the UCA version + in source/data/unidata/FractionalUCA.txt to match the new Unicode version +- ~/svn.icu/tools/trunk/src/unicode$ ./makeprops.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU & tools + + if genrb fails to build coll/root.res with an U_INVALID_FORMAT_ERROR, + check if the UCA version in FractionalUCA.txt matches the new Unicode version + (see step above) +- run makeuca.sh so that genuca picks up the new case mappings and nfc.nrm: + ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU & tools + +* update Java data files +- refresh just the UCD-related files, just to be safe +- see (ICU4C)/source/data/icu4j-readme.txt +- mkdir /tmp/icu4j +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + output: + ... 
+ Unicode .icu files built to ./out/build/icudt49l + mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt49b + mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt49b + echo pnames.icu ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt + LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt49l.dat ./out/icu4j/icudt49b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt49l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt49b + mv ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt49b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt49b" + jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt49b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data + jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt49b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data + make[1]: Leaving directory `/home/mscherer/svn.icu/trunk/bld/data' +- copy the big-endian Unicode data files to another location, + separate from the other data files + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/brkitr + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt49b + ~/svn.icu/trunk/bld/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/cnvalias.icu + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt49b + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/brkitr/* 
/tmp/icu4j/com/ibm/icu/impl/data/icudt49b/brkitr +- refresh ICU4J + ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt49b + +* refresh Java test .txt files +- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode + +* test ICU so far, fix test code where necessary +- temporarily ignore collation issues that look like UCA/UCD mismatches, + until UCA data is updated + +* UCA + +- get output from Mark's tools; look in + http://www.unicode.org/Public/UCA/6.1.0/CollationAuxiliary-.txt +- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt +- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt + (note removing the underscore before "Rules") +- update (ICU)/source/test/testdata/CollationTest_*.txt + and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt + with output from Mark's Unicode tools (..._CLDR_..._SHORT.txt) +- check test file diffs for previously commented-out, known-failing data lines; + probably need to keep those commented out +- check FractionalUCA.txt for manual changes of lead bytes from IMPLICIT to Hani +- run makeuca.sh: + ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU4C +- refresh ICU4J collation data: + (subset of instructions above for properties data refresh, except copies all coll/*) + ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + ~/svn.icu/trunk/bld$ mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt49b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt49b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt49b +- run all tests with the *_SHORT.txt or the full files (the full ones have comments, useful for debugging) +- note on intltest: if 
collate/UCAConformanceTest fails, then + utility/MultithreadTest/TestCollators will fail as well; + fix the conformance test before looking into the multi-thread test + +* When refreshing all of ICU4J data from ICU4C +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install +- cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data +or +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install + +*** LayoutEngine script information + +(For details see the Unicode 5.2 change log below.) + +* Run icu4j-tools: com.ibm.icu.dev.tool.layout.ScriptNameBuilder. + This generates LEScripts.h, LELanguages.h, ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp + in the working directory. + (It also generates ScriptRunData.cpp, which is no longer needed.) + + The generated files have a current copyright date and "@draft" statement. + +- diff current /source/layout files vs. generated ones + ~/svn.icu4j/trunk/src$ kdiff3 ~/svn.icu/trunk/src/source/layout tools/misc/src/com/ibm/icu/dev/tool/layout + review and manually merge desired changes; + fix gratuitous changes, incorrect @draft and missing aliases; + Unicode-derived script codes should be "born stable" like constants in uchar.h, uscript.h etc. 
+- if you just copy the above files, then + fix mixed line endings, review the diffs as above and restore changes to API tags etc.; + manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h + +*** merge the Unicode update branches back onto the trunk +- do not merge the icudata.jar and testdata.jar, + instead rebuild them from merged & tested ICU4C + +---------------------------------------------------------------------------- *** + +ICU 4.8 (no Unicode update, just new script codes) + +* 9 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html + (added 2010-12-21) + Afak 439 Afaka + Jurc 510 Jurchen + Mroo 199 Mro, Mru + Nshu 499 Nüshu + Shrd 319 Sharada, Śāradā + Sora 398 Sora Sompeng + Takr 321 Takri, Ṭākrī, Ṭāṅkrī + Tang 520 Tangut + Wole 480 Woleai + -> uscript.h + -> com.ibm.icu.lang.UScript + find USCRIPT_([^ ]+) *= ([0-9]+),(.+) + replace public static final int \1 = \2;\3 + -> genpname/SyntheticPropertyValueAliases.txt + -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java + +* run genpname/preparse.pl (on Linux) + + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname + + make sure that data.h is writable + + perl preparse.pl ~/svn.icu/trunk/src > out.txt + + preparse.pl shows no errors, out.txt Info and Warning lines look ok + +* rebuild Unicode tools (at least genpname) using make +- You might first need to "make install" ICU so that the tools build can pick + up the new definitions from the installed header files. 
+ +* run genpname + (builds both pnames.icu and propname_data.h) +- ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in +- ~/svn.icu/tools/trunk/bld/unicode/c$ genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource +- rebuild ICU & tools + +* run genprops +- ~/svn.icu/tools/trunk/bld/unicode/c$ genprops/genprops -d ~/svn.icu/trunk/src/source/data/in -s ~/svn.icu/trunk/src/source/data/unidata -i ~/svn.icu/trunk/dbg/data/out/build/icudt48l -u 6.0 +- ~/svn.icu/tools/trunk/bld/unicode/c$ genprops/genprops -d ~/svn.icu/trunk/src/source/common --csource -s ~/svn.icu/trunk/src/source/data/unidata -i ~/svn.icu/trunk/dbg/data/out/build/icudt48l -u 6.0 +- rebuild ICU & tools + +* update Java data files +- refresh just the UCD-related files, just to be safe +- see (ICU4C)/source/data/icu4j-readme.txt +- mkdir /tmp/icu4j +- ~/svn.icu/trunk/dbg$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install +- copy the big-endian Unicode data files to another location, + separate from the other data files + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt48b + ~/svn.icu/trunk/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt48b/pnames.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt48b + ~/svn.icu/trunk/dbg/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt48b/uprops.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt48b +- refresh ICU4J + ~/svn.icu/trunk/dbg/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt48b + +* should have updated the layout engine script codes but forgot + +---------------------------------------------------------------------------- *** + +Unicode 6.0 update + +*** related ICU Trac tickets + +7264 Unicode 6.0 Update + +*** Unicode version numbers +- makedata.mak +- uchar.h + (configure.in & configure: have been modified to extract the version from uchar.h) +- com.ibm.icu.util.VersionInfo + +*** data files & enums & parser code + +* file preparation + 
+~/svn.icu/tools/trunk/src/unicode/c/genprops/misc$ ./ucdcopy.py ~/uni60/20100720/ucd ~/uni60/processed +- This now prepares both unidata and testdata files in respective output subfolders. + +* PropertyAliases.txt changes +- new Script_Extensions property defined in the new ScriptExtensions.txt file + but not listed in PropertyAliases.txt; reported to unicode.org; + -> added to tools/trunk/src/unicode/c/genpname/SyntheticPropertyAliases.txt + scx; Script_Extensions + -> uchar.h with new UProperty section + -> com.ibm.icu.lang.UProperty, parallel with uchar.h + +* PropertyValueAliases.txt changes +- 12 new block names: + Alchemical_Symbols + Bamum_Supplement + Batak + Brahmi + CJK_Unified_Ideographs_Extension_D + Emoticons + Ethiopic_Extended_A + Kana_Supplement + Mandaic + Miscellaneous_Symbols_And_Pictographs + Playing_Cards + Transport_And_Map_Symbols + -> add to uchar.h + -> add to UCharacter.UnicodeBlock + Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+) + replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 +- Joining_Group (jg) values: + Teh_Marbuta_Goal becomes the new canonical value for the old Hamza_On_Heh_Goal which becomes an alias + -> uchar.h & UCharacter.JoiningGroup +- 3 new scripts: + sc ; Batk ; Batak + sc ; Brah ; Brahmi + sc ; Mand ; Mandaic + -> remove these from SyntheticPropertyValueAliases.txt + -> add alias USCRIPT_MANDAIC to USCRIPT_MANDAEAN + -> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java +- 13 new script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html + (added 2009-11-11..2010-07-18) + Bass 259 Bassa Vah + Dupl 755 Duployan shorthand + Elba 226 Elbasan + Gran 343 Grantha + Kpel 436 Kpelle + Loma 437 Loma + Mend 438 Mende + Merc 101 Meroitic Cursive + Narb 106 Old North Arabian + Nbat 159 Nabataean + Palm 126 Palmyrene + Sind 318 Sindhi + Wara 262 Warang Citi + -> uscript.h + -> com.ibm.icu.lang.UScript + find USCRIPT_([^ ]+) *=
([0-9]+),(.+) + replace public static final int \1 = \2;\3 + -> SyntheticPropertyValueAliases.txt + -> add to expectedLong and expectedShort names in cintltst/cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java +- ISO 15924 name change + Mero 100 Meroitic Hieroglyphs (was Meroitic) + -> add new alias USCRIPT_MEROITIC_HIEROGLYPHS to USCRIPT_MEROITIC +- property value alias added for Cham, was already moved out of SyntheticPropertyValueAliases.txt + +* UnicodeData.txt changes +- new CJK block: + 2B740;;Lo;0;L;;;;;N;;;;; + 2B81D;;Lo;0;L;;;;;N;;;;; + -> add to tools/trunk/src/unicode/c/gennames/gennames.c, with new ucdVersion + +* build Unicode tools using CMake+make + +* run genpname/preparse.pl (on Linux) + + cd ~/svn.icu/tools/trunk/src/unicode/c/genpname + + make sure that data.h is writable + + perl preparse.pl ~/svn.icu/trunk/src > out.txt + + preparse.pl shows no errors, out.txt Info and Warning lines look ok + +* rebuild Unicode tools (at least genpname) using make +- You might first need to "make install" ICU so that the tools build can pick + up the new definitions from the installed header files. 
+ +* run genpname +- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in +- rebuild ICU & tools + +* update source/data/unidata/norm2/nfkc_cf.txt +- follow the instructions in nfkc_cf.txt for updating it from DerivedNormalizationProps.txt + +* update source/data/unidata/norm2/uts46.txt +- download http://www.unicode.org/Public/idna/6.0.0/IdnaMappingTable.txt + to ~/svn.icu/tools/trunk/src/unicode/py +- adjust idna2nrm.py to handle new disallowed_STD3_valid and disallowed_STD3_mapped values +- ~/svn.icu/tools/trunk/src/unicode/py$ ./idna2nrm.py +- ~/svn.icu/tools/trunk/src/unicode/py$ cp uts46.txt ~/svn.icu/trunk/src/source/data/unidata/norm2 + +* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to + sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) +- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters +- Unicode 6.0: U+2260, U+226E, U+226F + +* generate core properties data files +- ~/svn.icu/tools/trunk/src/unicode$ ./makeprops.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU & tools +- run makeuca.sh so that genuca picks up the new nfc.nrm: + ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU & tools + +* implement new Script_Extensions property (provisional) +- parser & generator: genprops & uprops.icu +- uscript.h, uprops.h, uchar.c, uniset_props.cpp and others, plus cintltst/cucdapi.c & intltest/usettest.cpp +- UScript.java, UCharacterProperty.java, UnicodeSet.java, TestUScript.java, UnicodeSetTest.java + +* switch ubidi.icu, ucase.icu and uprops.icu from UTrie to UTrie2 +- (one-time change) +- genbidi/gencase/genprops tools changes +- re-run makeprops.sh (see above) +- UCharacterProperty.java, UCharacterTypeIterator.java, + UBiDiProps.java, UCaseProps.java, and several others with minor changes; + UCharacterPropertyReader.java deleted and its 
code folded into UCharacterProperty.java + +* update Java data files +- refresh just the UCD-related files, just to be safe +- see (ICU4C)/source/data/icu4j-readme.txt +- mkdir /tmp/icu4j +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + output: + ... + Unicode .icu files built to ./out/build/icudt45l + mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt45b + echo ubidi.icu ucase.icu uprops.icu > ./out/icu4j/add.txt + LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt45l.dat ./out/icu4j/icudt45b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt45l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt45b + jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt45b + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data +- copy the big-endian Unicode data files to another location, + separate from the other data files + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b + ~/svn.icu/trunk/bld/data/out/icu4j$ rm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/cnvalias.icu + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/icudt45b + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/*.icu /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/brkitr +- refresh ICU4J + ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b + +* refresh Java test .txt files +- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode + +* un-hardcode normalization skippable (NF*_Inert) test 
data +- removes one manual step from the Unicode upgrade, and removes dependency on one of Mark's tools + +* copy updated break iterator test files +- now handled by early ucdcopy.py and + copying the uni60/processed/testdata files to ~/svn.icu/trunk/src/source/test/testdata + (old instructions: + copy from (Unicode 6.0)/ucd/auxiliary/*BreakTest-6....txt + to ~/svn.icu/trunk/src/source/test/testdata) +- they are not used in ICU4J + +* UCA + +- get output from Mark's tools; look in + http://www.unicode.org/~book/incoming/mark/uca6.0.0/ + http://www.macchiato.com/unicode/utc/additional-uca-files + http://www.unicode.org/Public/UCA/6.0.0/ + http://www.unicode.org/~mdavis/uca/ +- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt +- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt +- update Han-implicit ranges for new CJK extensions: + swapCJK() in ucol.cpp & ImplicitCEGenerator.java +- genuca: allow bytes 02 for U+FFFE, new merge-sort character; + do not add it into invuca so that tailoring primary-after an ignorable works +- genuca: permit space between [variable top] bytes +- ucol.cpp: treat noncharacters like unassigned rather than ignorable +- run makeuca.sh: + ~/svn.icu/tools/trunk/src/unicode$ ./makeuca.sh ~/svn.icu/trunk/src ~/svn.icu/trunk/bld +- rebuild ICU4C +- refresh ICU4J collation data: + (subset of instructions above for properties data refresh, except copies all coll/*) + ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ cp com/ibm/icu/impl/data/icudt45b/coll/* /tmp/icu4j/com/ibm/icu/impl/data/icudt45b/coll + ~/svn.icu/trunk/bld/data/out/icu4j$ jar uf ~/svn.icu4j/trunk/src/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/icudt45b +- update (ICU)/source/test/testdata/CollationTest_*.txt + and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt + with output from Mark's 
Unicode tools +- run all tests with the *_SHORT.txt or the full files (the full ones have comments) +- note on intltest: if collate/UCAConformanceTest fails, then + utility/MultithreadTest/TestCollators will fail as well; + fix the conformance test before looking into the multi-thread test + +* When refreshing all of ICU4J data from ICU4C +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install +- cp /tmp/icu4j/main/shared/data/icudata.jar ~/svn.icu4j/trunk/src/main/shared/data +or +- ~/svn.icu/trunk/bld$ make ICU4J_ROOT=~/svn.icu4j/trunk/src icu4j-data-install + +*** LayoutEngine script information + +(For details see the Unicode 5.2 change log below.) + +* Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguages.h, +ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates +ScriptRunData.cpp, which is no longer needed.) + +The generated files have a current copyright date and "@draft" statement. + +* copy the above files into (ICU4C)/source/layout, replacing the old files. +* fix mixed line endings +* review the diffs and fix incorrect @draft and missing aliases; + Unicode-derived script codes should be "born stable" like constants in uchar.h, uscript.h etc. 
+* manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h + +---------------------------------------------------------------------------- *** + +Unicode 5.2 update + +*** related ICU Trac tickets + +7084 Unicode 5.2 + +7167 verify collation bytes +7235 Java test NAME_ALIAS +7236 Java DerivedCoreProperties.txt test +7237 Java BidiTest.txt +7238 UTrie2 in core unidata +7239 test for tailoring gaps +7240 Java fix CollationMiscTest +7243 update layout engine for Unicode 5.2 + +*** Unicode version numbers +- makedata.mak +- uchar.h +- configure.in & configure +- update ucdVersion in gennames.c if an algorithmic range changes + +*** data files & enums & parser code + +* file preparation + +python source\tools\genprops\misc\ucdcopy.py "C:\Documents and Settings\mscherer\My Documents\unicode\ucd\5.2.0" C:\svn\icuproj\icu\trunk\source\data\unidata +- includes finding files regardless of version numbers, + copying them, and performing the equivalent processing of the + ucdstrip and ucdmerge tools on the desired set of files + +* notes on changes +- PropertyAliases.txt + moved from numeric to enumerated: + ccc ; Canonical_Combining_Class + new string properties: + NFKC_CF ; NFKC_Casefold + Name_Alias; Name_Alias + new binary properties: + Cased ; Cased + CI ; Case_Ignorable + CWCF ; Changes_When_Casefolded + CWCM ; Changes_When_Casemapped + CWKCF ; Changes_When_NFKC_Casefolded + CWL ; Changes_When_Lowercased + CWT ; Changes_When_Titlecased + CWU ; Changes_When_Uppercased + new CJK Unihan properties (not supported by ICU) +- PropertyValueAliases.txt + new block names + new scripts + one script code change: + sc ; Qaai ; Inherited + -> + sc ; Zinh ; Inherited ; Qaai + new Line_Break (lb) value: + lb ; CP ; Close_Parenthesis + new Joining_Group (jg) values: Farsi_Yeh, Nya + other new values: + ccc; 214; ATA ; Attached_Above +- DerivedBidiClass.txt + new default-R range: U+1E800 - U+1EFFF +- UnicodeData.txt + all of the ISO comments are gone + new CJK block 
end: + 9FC3; -> 9FCB; + new CJK block: + 2A700;;Lo;0;L;;;;;N;;;;; + 2B734;;Lo;0;L;;;;;N;;;;; + +* genpname +- run preparse.pl + + cd \svn\icuproj\icu\trunk\source\tools\genpname + + make sure that data.h is writable + + perl preparse.pl \svn\icuproj\icu\trunk > out.txt + + preparse.pl complains with errors like the following: + Error: sc:Egyp already set to Egyptian_Hieroglyphs, cannot set to Egyp at preparse.pl line 1322, line 34. + This is because ICU 4.0 had scripts from ISO 15924 which are now + added to Unicode 5.2, and the Perl script shows a conflict between SyntheticPropertyValueAliases.txt + and PropertyValueAliases.txt. + -> Removed duplicate script entries from SyntheticPropertyValueAliases.txt: + Egyp, Java, Lana, Mtei, Orkh, Armi, Avst, Kthi, Phli, Prti, Samr, Tavt + + preparse.pl complains with errors about block names missing from uchar.h; add them + +* uchar.h & uscript.h & uprops.h & uprops.c & genprops +- new block & script values + + 26 new blocks + copy new blocks from Blocks.txt + MS VC++ 2008 regular expression: + find "^{[0-9A-F]+}\.\.{[0-9A-F]+}; {[A-Z].+}$" + replace with " UBLOCK_\3 = 172, /*[\1]*/" + + several new script values already added in ICU 4.0 for ISO 15924 coverage + (removed from SyntheticPropertyValueAliases.txt, see genpname notes above) + + 3 new script values added for ISO 15924 and Unicode 5.2 coverage + + 1 new script value added for ISO 15924 coverage (not in Unicode 5.2) + (added to SyntheticPropertyValueAliases.txt) +- new Joining Group (JG) values: Farsi_Yeh, Nya +- new Line_Break (lb) value: + lb ; CP ; Close_Parenthesis + +* hardcoded Unihan range end/limit +- Unihan range end moves from 9FC3 to 9FCB + search for both 9FC3 (end) and 9FC4 (limit) (regex 9FC[34], case-insensitive) + + do change gennames.c + +* Compare definitions of new binary properties with what we used to use + in algorithms, to see if the definitions changed. +- Verified that definitions for Cased and Case_Ignorable are unchanged. 
+ The gencase tool now parses the newly public Case_Ignorable values + in case the definition changes in the future. + +* uchar.c & uprops.h & uprops.c & genprops +- new numeric values that didn't exist in Unicode data before: + 1/7, 1/9, 1/10, 3/10, 1/16, 3/16 + the ones with denominators >9 cannot be supported by uprops.icu formatVersion 5, + therefore redesign the encoding of numeric types and values for formatVersion 6; + design for simple numbers up to at least 144 ("one gross"), + large values up to at least 10^20, + and fractions with numerators -1..17 and denominators 1..16 + to cover current and expected future values + (e.g., more Han numeric values, Meroitic twelfths) + +* reimplement Hangul_Syllable_Type for new Jamo characters +- the old code assumed that all Jamo characters are in the 11xx block +- Unicode 5.2 fills holes there and adds new Jamo characters in + A960..A97F; Hangul Jamo Extended-A + and in + D7B0..D7FF; Hangul Jamo Extended-B +- Hangul_Syllable_Type can be trivially derived from a subset of + Grapheme_Cluster_Break values + +* build Unicode data source code for hardcoding core data +C:\svn\icuproj\icu\trunk\source\data>NMAKE /f makedata.mak ICUMAKE=\svn\icuproj\icu\trunk\source\data\ CFG=x86\release uni-core-data + +ICU data make path is \svn\icuproj\icu\trunk\source\data\ +ICU root path is \svn\icuproj\icu\trunk +Information: cannot find "ucmlocal.mk". Not building user-additional converter files. +Information: cannot find "brklocal.mk". Not building user-additional break iterator files. +Information: cannot find "reslocal.mk". Not building user-additional resource bundle files. +Information: cannot find "collocal.mk". Not building user-additional resource bundle files. +Information: cannot find "rbnflocal.mk". Not building user-additional resource bundle files. +Information: cannot find "trnslocal.mk". Not building user-additional transliterator files. +Information: cannot find "misclocal.mk". 
Not building user-additional miscellaenous files. +Information: cannot find "spreplocal.mk". Not building user-additional stringprep files. +Creating data file for Unicode Property Names +Creating data file for Unicode Character Properties +Creating data file for Unicode Case Mapping Properties +Creating data file for Unicode BiDi/Shaping Properties +Creating data file for Unicode Normalization +Unicode .icu files built to "\svn\icuproj\icu\trunk\source\data\out\build\icudt43l" +Unicode .c source files built to "\svn\icuproj\icu\trunk\source\data\out\tmp" + +- copy the .c source files to C:\svn\icuproj\icu\trunk\source\common + and rebuild the common library + +*** UCA + +- update FractionalUCA.txt with new canonical closure (output from Mark's Unicode tools) +- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt from Mark's Unicode tools +- update source/test/testdata/CollationTest_*.txt with output from Mark's Unicode tools +[ Begin obsolete instructions: + Starting with UCA 5.2, we use the CollationTest_*_SHORT.txt files not the *_STUB.txt files. 
+ - generate the source/test/testdata/CollationTest_*_STUB.txt files via source/tools/genuca/genteststub.py + on Windows: + python C:\svn\icuproj\icu\trunk\source\tools\genuca\genteststub.py CollationTest_NON_IGNORABLE_SHORT.txt CollationTest_NON_IGNORABLE_STUB.txt + python C:\svn\icuproj\icu\trunk\source\tools\genuca\genteststub.py CollationTest_SHIFTED_SHORT.txt CollationTest_SHIFTED_STUB.txt + End obsolete instructions] +- run all tests with the *_SHORT.txt or the full files (the full ones have comments) + not just the *_STUB.txt files +- note on intltest: if collate/UCAConformanceTest fails, then + utility/MultithreadTest/TestCollators will fail as well; + fix the conformance test before looking into the multi-thread test + +*** Implement Cased & Case_Ignorable properties +- via UProperty; call ucase.h functions ucase_getType() and ucase_getTypeOrIgnorable() +- Problem: These properties should be disjoint, but aren't +- UTC 2009nov decision: skip all Case_Ignorable regardless of whether they are Cased or not +- change ucase.icu to be able to store any combination of Cased and Case_Ignorable + +*** Implement Changes_When_Xyz properties +- without stored data + +*** Implement Name_Alias property +- add it as another name field in unames.icu +- make it available via u_charName() and UCharNameChoice +- consider it in u_charFromName() + +*** Break iterators + +* Update break iterator rules to new UAX versions and new property values +* Update source/test/testdata/*BreakTest.txt files from (Unicode 5.2)/ucd/auxiliary + +*** new BidiTest file +- review format and data +- copy BidiTest.txt to source/test/testdata +- write test code using this data +- fix ICU code where it fails the conformance test + +*** Java +- generally, find and update code corresponding to C/C++ +- UCharacter.UnicodeBlock constants: + a) add an _ID integer per new block, update COUNT + b) add a class instance per new block + Visual Studio regex: + find UBLOCK_{[^ ]+} = [0-9]+, {/.+} + replace with public static 
final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 +- CHAR_NAME_ALIAS -> UCharacter.getNameAlias() and getCharFromNameAlias() + +- port test changes to Java + +*** LayoutEngine script information + +(For comparison, see the Unicode 5.1 update: http://bugs.icu-project.org/trac/changeset/23833) + +* Run ICU4J com.ibm.icu.dev.tool.layout.ScriptNameBuilder. This generates LEScripts.h, LELanguages.h, +ScriptAndLanguageTags.h and ScriptAndLanguageTags.cpp in the working directory. (It also generates +ScriptRunData.cpp, which is no longer needed.) + +The generated files have a current copyright date and "@draft" statement. + +-> Eric Mader wrote in email on 20090930: + "I think the tool has been modified to update @draft to @stable for + older scripts and to add @draft for new scripts. + (I worked with an intern on this last year.) + You should check the output after you run it." + +* copy the above files into (ICU4C)/source/layout, replacing the old files. +* fix mixed line endings +* review the diffs and fix incorrect @draft and missing aliases +* manually re-add the "Indic script xyz v.2" tags in ScriptAndLanguageTags.h + +Add new default entries to the indicClassTables array in (ICU4C)/source/layout/IndicClassTables.cpp +and the complexTable array in (ICU4C)/source/layoutex/ParagraphLayout.cpp. (This step should be automated...) + +-> Eric Mader wrote in email on 20090930: + "This is just a matter of making sure that all the per-script tables have + entries for any new scripts that were added. + If any new Indic characters were added, then the class tables in + IndicClassTables.cpp should be updated to reflect this. + John Emmons should know how to do this if it's required." + +* rebuild the layout and layoutex libraries. + +*** Documentation +- Update User Guide + + Jamo_Short_Name, sfc->scf, binary property value aliases + +---------------------------------------------------------------------------- *** + Unicode 5.1 update *** related ICU Trac tickets