X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/38fbf2fd31f5cd99b500914d6037b1d06b608645..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/unidata/changes.txt diff --git a/icuSources/data/unidata/changes.txt b/icuSources/data/unidata/changes.txt index ef717b12..8ace8ca2 100644 --- a/icuSources/data/unidata/changes.txt +++ b/icuSources/data/unidata/changes.txt @@ -49,6 +49,395 @@ For new script codes see http://www.unicode.org/iso15924/codechanges.html ---------------------------------------------------------------------------- *** +Unicode 11.0 update for ICU 62 + +http://www.unicode.org/versions/Unicode11.0.0/ +http://unicode.org/versions/beta-11.0.0.html +https://www.unicode.org/review/pri372/ +http://www.unicode.org/reports/uax-proposed-updates.html +http://www.unicode.org/reports/tr44/tr44-21.html + +* Command-line environment setup + +UNICODE_DATA=~/unidata/uni11/20180521 +CLDR_SRC=~/svn.cldr/uni +ICU_ROOT=~/svn.icu/uni +ICU_SRC=$ICU_ROOT/src +ICUDT=icudt61b +ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in +ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata +export LD_LIBRARY_PATH=$ICU_ROOT/dbg/icu4c/lib + +*** ICU Trac + +- ticket:13630: Unicode 11 +- ^/branches/markus/uni11 + +*** CLDR Trac + +- cldrbug 10978: Unicode 11 +- ^/branches/markus/uni11 + +*** Unicode version numbers +- makedata.mak +- uchar.h +- com.ibm.icu.util.VersionInfo +- com.ibm.icu.dev.test.lang.UCharacterTest.VERSION_ + +- Run ICU4C "configure" _after_ updating the Unicode version number in uchar.h + so that the makefiles see the new version number. 
+ +*** data files & enums & parser code + +* download files +- mkdir -p $UNICODE_DATA +- download Unicode files into $UNICODE_DATA + + subfolders: emoji, idna, security, ucd, uca + + inside ucd: extract Unihan.zip to "here" (.../ucd/Unihan/*.txt), delete Unihan.zip + +* for manual diffs and for Unicode Tools input data updates: + remove version suffixes from the file names + ~$ unidata/desuffixucd.py $UNICODE_DATA + (see https://sites.google.com/site/unicodetools/inputdata) + +* process and/or copy files +- $ICU_SRC/tools/unicode$ py/preparseucd.py $UNICODE_DATA $ICU_SRC + + This writes files (especially ppucd.txt) to the ICU4C unidata and testdata subfolders. + + For debugging, and tweaking how ppucd.txt is written, + the tool has an --only_ppucd option: + py/preparseucd.py $UNICODE_DATA --only_ppucd path/to/ppucd/outputfile + +- cp $UNICODE_DATA/security/confusables.txt $ICU4C_UNIDATA + +* build ICU (make install) + so that the tools build can pick up the new definitions from the installed header files. 
+ + $ICU_ROOT/dbg/icu4c$ echo;echo; make -j7 install > out.txt 2>&1 ; tail -n 30 out.txt ; date + +* preparseucd.py changes +- fix other errors + NameError: unknown property Extended_Pictographic + -> add Extended_Pictographic binary property + -> add new short names for all Emoji properties + +* new constants for new property values +- preparseucd.py error: + ValueError: missing uchar.h enum constants for some property values: + [(u'blk', set([u'Georgian_Ext', u'Hanifi_Rohingya', u'Medefaidrin', u'Sogdian', u'Makasar', + u'Old_Sogdian', u'Dogra', u'Gunjala_Gondi', u'Chess_Symbols', u'Mayan_Numerals', + u'Indic_Siyaq_Numbers'])), + (u'jg', set([u'Hanifi_Rohingya_Kinna_Ya', u'Hanifi_Rohingya_Pa'])), + (u'sc', set([u'Medf', u'Sogd', u'Dogr', u'Rohg', u'Maka', u'Sogo', u'Gong'])), + (u'GCB', set([u'LinkC', u'Virama'])), + (u'WB', set([u'WSegSpace']))] + = PropertyValueAliases.txt new property values (diff old & new .txt files) + blk; Chess_Symbols ; Chess_Symbols + blk; Dogra ; Dogra + blk; Georgian_Ext ; Georgian_Extended + blk; Gunjala_Gondi ; Gunjala_Gondi + blk; Hanifi_Rohingya ; Hanifi_Rohingya + blk; Indic_Siyaq_Numbers ; Indic_Siyaq_Numbers + blk; Makasar ; Makasar + blk; Mayan_Numerals ; Mayan_Numerals + blk; Medefaidrin ; Medefaidrin + blk; Old_Sogdian ; Old_Sogdian + blk; Sogdian ; Sogdian + -> add to uchar.h + use long property names for enum constants, + for the trailing comment get the block start code point: diff old & new Blocks.txt + -> add to UCharacter.UnicodeBlock IDs + Eclipse find UBLOCK_([^ ]+) = ([0-9]+), (/.+) + replace public static final int \1_ID = \2; \3 + -> add to UCharacter.UnicodeBlock objects + Eclipse find UBLOCK_([^ ]+) = [0-9]+, (/.+) + replace public static final UnicodeBlock \1 = new UnicodeBlock("\1", \1_ID); \2 + + GCB; LinkC ; LinkingConsonant + GCB; Virama ; Virama + -> uchar.h & UCharacter.GraphemeClusterBreak + -> these two later removed again: http://www.unicode.org/L2/L2018/18115.htm#155-A76 + + InSC; 
Consonant_Initial_Postfixed ; Consonant_Initial_Postfixed + -> ignore: ICU does not yet support this property + + jg ; Hanifi_Rohingya_Kinna_Ya ; Hanifi_Rohingya_Kinna_Ya + jg ; Hanifi_Rohingya_Pa ; Hanifi_Rohingya_Pa + -> uchar.h & UCharacter.JoiningGroup + + sc ; Dogr ; Dogra + sc ; Gong ; Gunjala_Gondi + sc ; Maka ; Makasar + sc ; Medf ; Medefaidrin + sc ; Rohg ; Hanifi_Rohingya + sc ; Sogd ; Sogdian + sc ; Sogo ; Old_Sogdian + -> uscript.h & com.ibm.icu.lang.UScript + -> Nushu had been added already + -> fix expectedLong names in cucdapi.c/TestUScriptCodeAPI() + and in com.ibm.icu.dev.test.lang.TestUScript.java + + WB ; WSegSpace ; WSegSpace + -> uchar.h & UCharacter.WordBreak + +* New short names for emoji properties +- see UTS #51 +- short names set in preparseucd.py + +* New properties +- boolean emoji property Extended_Pictographic + -> added in preparseucd.py + -> uchar.h & UProperty.java +- misc. property Equivalent_Unified_Ideograph (EqUIdeo) + as shown in PropertyValueAliases.txt + -> ignore for now + +* update Script metadata: SCRIPT_PROPS[] in uscript_props.cpp & UScript.ScriptMetadata + (not strictly necessary for NOT_ENCODED scripts) + $ICU_SRC/tools/unicode$ py/parsescriptmetadata.py $ICU_SRC/icu4c/source/common/unicode/uscript.h $CLDR_SRC/common/properties/scriptMetadata.txt + +* update spoof checker UnicodeSet initializers: + inclusionPat & recommendedPat in uspoof.cpp + INCLUSION & RECOMMENDED in SpoofChecker.java +- make sure that the Unicode Tools tree contains the latest security data files +- go to Unicode Tools org.unicode.text.tools.RecommendedSetGenerator +- update the hardcoded version number there in the DIRECTORY path +- run the tool (no special environment variables needed) +- copy & paste from the Console output into the .cpp & .java files + +* generate normalization data files + cd $ICU_ROOT/dbg/icu4c + bin/gennorm2 -o $ICU_SRC/icu4c/source/common/norm2_nfc_data.h -s $ICU4C_UNIDATA/norm2 nfc.txt --csource + bin/gennorm2 -o 
$ICU4C_DATA_IN/nfc.nrm -s $ICU4C_UNIDATA/norm2 nfc.txt + bin/gennorm2 -o $ICU4C_DATA_IN/nfkc.nrm -s $ICU4C_UNIDATA/norm2 nfc.txt nfkc.txt + bin/gennorm2 -o $ICU4C_DATA_IN/nfkc_cf.nrm -s $ICU4C_UNIDATA/norm2 nfc.txt nfkc.txt nfkc_cf.txt + bin/gennorm2 -o $ICU4C_DATA_IN/uts46.nrm -s $ICU4C_UNIDATA/norm2 nfc.txt uts46.txt + +* build ICU (make install) + so that the tools build can pick up the new definitions from the installed header files. + + $ICU_ROOT/dbg/icu4c$ echo;echo; make -j7 install > out.txt 2>&1 ; tail -n 30 out.txt ; date + +* build Unicode tools using CMake+make + +$ICU_SRC/tools/unicode/c/icudefs.txt: + +# Location (--prefix) of where ICU was installed. +set(ICU_INST_DIR /usr/local/google/home/mscherer/svn.icu/trunk/inst/icu4c) +# Location of the ICU4C source tree. +set(ICU4C_SRC_DIR /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c) + + $ICU_ROOT/dbg$ + mkdir -p tools/unicode/c + cd tools/unicode/c + + $ICU_ROOT/dbg/tools/unicode/c$ + cmake ../../../../src/tools/unicode/c + make + +* generate core properties data files + $ICU_ROOT/dbg/tools/unicode/c$ + genprops/genprops $ICU_SRC/icu4c + genuca/genuca --hanOrder implicit $ICU_SRC/icu4c + genuca/genuca --hanOrder radical-stroke $ICU_SRC/icu4c +- rebuild ICU (make install) & tools + +* Fix case props + genprops error: casepropsbuilder: too many exceptions words + genprops error: failure finalizing the data - U_BUFFER_OVERFLOW_ERROR +- With the addition of Georgian Mtavruli capital letters, + there are now too many simple case mappings with big mapping deltas + that yield uncompressible exceptions. +- Changing the data structure (now formatVersion 4), + adding one bit for no-simple-case-folding (for Cherokee), and + one optional slot for a big delta (for most faraway mappings), + together with another bit for whether that is negative. + This makes most Cherokee & Georgian etc. case mappings compressible, + reducing the number of exceptions words. 
+- Further changes to gain one more bit for the exceptions index, +  for future growth. For details, see casepropsbuilder.cpp. + +* update uts46test.cpp and UTS46Test.java if there are new characters that are equivalent to +    sequences with non-LDH ASCII (that is, their decompositions contain '=' or similar) +- grep IdnaMappingTable.txt or uts46.txt for "disallowed_STD3_valid" on non-ASCII characters +- Unicode 6.0..11.0: U+2260, U+226E, U+226F +- nothing new in this Unicode version, no test file to update + +* run & fix ICU4C tests +- Andy handles RBBI & spoof check test failures + +- Errors in char.txt, word.txt, word_POSIX.txt like +    createRuleBasedBreakIterator: ICU Error "U_BRK_RULE_EMPTY_SET" at line 46, column 16 +    because \p{Grapheme_Cluster_Break = EBG} and \p{Word_Break = EBG} are empty. +    -> Temporary(!) workaround: Add an arbitrary code point to these sets to make them +       not empty, just to get ICU building. +    -> Intermediate workaround: Remove $E_Base_GAZ and other now-unused variables +       and properties together with the rules that used them (GB 10, WB 14). +    -> Andy adjusts the rule sets further to sync with +       Unicode 11 grapheme, word, and line break spec changes. 
+ +* collation: CLDR collation root, UCA DUCET + +- UCA DUCET goes into Mark's Unicode tools, see + https://sites.google.com/site/unicodetools/home#TOC-UCA + diff the main mapping file, look for bad changes + (for example, more bytes per weight for common characters) + ~/svn.unitools/trunk$ sed -r -f ~/svn.cldr/uni/tools/scripts/uca/blankweights.sed ../Generated/uca/11.0.0/CollationAuxiliary/FractionalUCA.txt > ../frac-11.txt + ~/svn.unitools/trunk$ meld ../frac-10.txt ../frac-11.txt + +- CLDR root data files are checked into $CLDR_SRC/common/uca/ + cp (Unicode Tools UCA generated)/CollationAuxiliary/* $CLDR_SRC/common/uca/ + +- update source/data/unidata/FractionalUCA.txt with FractionalUCA_SHORT.txt + cp $CLDR_SRC/common/uca/FractionalUCA_SHORT.txt $ICU4C_UNIDATA/FractionalUCA.txt +- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt + cp $ICU4C_UNIDATA/UCARules.txt /tmp/UCARules-old.txt + (note removing the underscore before "Rules") + cp $CLDR_SRC/common/uca/UCA_Rules_SHORT.txt $ICU4C_UNIDATA/UCARules.txt +- restore TODO diffs in UCARules.txt + meld /tmp/UCARules-old.txt $ICU4C_UNIDATA/UCARules.txt +- update (ICU4C)/source/test/testdata/CollationTest_*.txt + and (ICU4J)/main/tests/collate/src/com/ibm/icu/dev/data/CollationTest_*.txt + from the CLDR root files (..._CLDR_..._SHORT.txt) + cp $CLDR_SRC/common/uca/CollationTest_CLDR_NON_IGNORABLE_SHORT.txt $ICU_SRC/icu4c/source/test/testdata/CollationTest_NON_IGNORABLE_SHORT.txt + cp $CLDR_SRC/common/uca/CollationTest_CLDR_SHIFTED_SHORT.txt $ICU_SRC/icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt + cp $ICU_SRC/icu4c/source/test/testdata/CollationTest_*.txt $ICU_SRC/icu4j/main/tests/collate/src/com/ibm/icu/dev/data +- if CLDR common/uca/unihan-index.txt changes, then update + CLDR common/collation/root.xml + and regenerate (or update in parallel) $ICU_SRC/icu4c/source/data/coll/root.txt + +- run genuca, see command line above; + deal with + Error: Unknown script for first-primary sample 
character U+1180B on line 28649 of /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c/source/data/unidata/FractionalUCA.txt: + FDD1 1180B; [71 CC 02, 05, 05] # Dogra first primary (compressible) + (add the character to genuca.cpp sampleCharsToScripts[]) + + look up the USCRIPT_ code for the new sample characters + (should be obvious from the comment in the error output) + + *add* mappings to sampleCharsToScripts[], do not replace them + (in case the script sample characters flip-flop) + + insert new scripts in DUCET script order, see the top_byte table + at the beginning of FractionalUCA.txt +- rebuild ICU4C + +* Unihan collators + https://sites.google.com/site/unicodetools/unihan +- run Unicode Tools + org.unicode.draft.GenerateUnihanCollators + with VM arguments + -ea + -DSVN_WORKSPACE=/usr/local/google/home/mscherer/svn.unitools/trunk + -DOTHER_WORKSPACE=/usr/local/google/home/mscherer/svn.unitools + -DUCD_DIR=/usr/local/google/home/mscherer/svn.unitools/trunk/data + -DCLDR_DIR=/usr/local/google/home/mscherer/svn.cldr/uni + -DUVERSION=11.0.0 +- run Unicode Tools + org.unicode.draft.GenerateUnihanCollatorFiles + with the same arguments +- check CLDR diffs + cd $CLDR_SRC + meld common/collation/zh.xml ../Generated/cldr/han/replace/zh.xml + meld common/transforms/Han-Latin.xml ../Generated/cldr/han/replace/Han-Latin.xml +- copy to CLDR + cd $CLDR_SRC + cp ../Generated/cldr/han/replace/zh.xml common/collation/zh.xml + cp ../Generated/cldr/han/replace/Han-Latin.xml common/transforms/Han-Latin.xml +- run CLDR unit tests, commit to CLDR +- generate ICU zh collation data: run CLDR + org.unicode.cldr.icu.NewLdml2IcuConverter + with program arguments + -t collation + -s /usr/local/google/home/mscherer/svn.cldr/uni/common/collation + -m /usr/local/google/home/mscherer/svn.cldr/uni/common/supplemental + -d /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c/source/data/coll + -p /usr/local/google/home/mscherer/svn.icu/uni/src/icu4c/source/data/xml/collation + zh + and 
VM arguments + -ea + -DCLDR_DIR=/usr/local/google/home/mscherer/svn.cldr/uni +- rebuild ICU4C + +* run & fix ICU4C tests, now with new CLDR collation root data +- run all tests with the collation test data *_SHORT.txt or the full files + (the full ones have comments, useful for debugging) +- note on intltest: if collate/UCAConformanceTest fails, then + utility/MultithreadTest/TestCollators will fail as well; + fix the conformance test before looking into the multi-thread test + +* update Java data files +- refresh just the UCD/UCA-related/derived files, just to be safe +- see (ICU4C)/source/data/icu4j-readme.txt +- mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT +- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install + output: + ... + Unicode .icu files built to ./out/build/icudt61l + echo timestamp > uni-core-data + mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt61b + mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt61b + echo pnames.icu uprops.icu ucase.icu ubidi.icu nfc.nrm > ./out/icu4j/add.txt + LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt61l.dat ./out/icu4j/icudt61b.dat -a ./out/icu4j/add.txt -s ./out/build/icudt61l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt61b + mv ./out/icu4j/"com/ibm/icu/impl/data/icudt61b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt61b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt61b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt61b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt61b" + jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt61b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data + jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt61b/ + mkdir -p /tmp/icu4j/main/shared/data + cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data + make[1]: Leaving directory 
'/usr/local/google/home/mscherer/svn.icu/uni/dbg/icu4c/data' +- copy the big-endian Unicode data files to another location, + separate from the other data files, + and then refresh ICU4J + cd $ICU_ROOT/dbg/icu4c/data/out/icu4j + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/coll + mkdir -p /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/brkitr + cp com/ibm/icu/impl/data/$ICUDT/confusables.cfu /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT + cp com/ibm/icu/impl/data/$ICUDT/*.icu /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT + rm /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/cnvalias.icu + cp com/ibm/icu/impl/data/$ICUDT/*.nrm /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT + cp com/ibm/icu/impl/data/$ICUDT/coll/* /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/coll + cp com/ibm/icu/impl/data/$ICUDT/brkitr/* /tmp/icu4j/com/ibm/icu/impl/data/$ICUDT/brkitr + jar uvf $ICU_SRC/icu4j/main/shared/data/icudata.jar -C /tmp/icu4j com/ibm/icu/impl/data/$ICUDT + +* When refreshing all of ICU4J data from ICU4C +- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=/tmp/icu4j icu4j-data-install +- cp /tmp/icu4j/main/shared/data/icudata.jar $ICU_SRC/icu4j/main/shared/data +or +- $ICU_ROOT/dbg/icu4c$ make ICU4J_ROOT=$ICU_SRC/icu4j icu4j-data-install + +* update CollationFCD.java + + copy & paste the initializers of lcccIndex[] etc. 
from + ICU4C/source/i18n/collationfcd.cpp to + ICU4J/main/classes/collate/src/com/ibm/icu/impl/coll/CollationFCD.java + +* refresh Java test .txt files +- copy new .txt files into ICU4J's main/tests/core/src/com/ibm/icu/dev/data/unicode + cd $ICU_SRC/icu4c/source/data/unidata + cp confusables.txt confusablesWholeScript.txt NormalizationCorrections.txt NormalizationTest.txt SpecialCasing.txt UnicodeData.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode + cd ../../test/testdata + cp BidiCharacterTest.txt BidiTest.txt IdnaTestV2.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode + cp $UNICODE_DATA/ucd/CompositionExclusions.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode + +* run & fix ICU4J tests + +*** API additions +- send notice to icu-design about new born-@stable API (enum constants etc.) + +*** CLDR numbering systems +- look for new sets of decimal digits (gc=ND & nv=4) and add to CLDR + Unicode 11: using Unicode 11 CLDR ticket #10978 + rohg 10D30..10D39 Hanifi_Rohingya + gong 11DA0..11DA9 Gunjala_Gondi + Earlier: CLDR tickets specific to adding new numbering systems. + Unicode 10: http://unicode.org/cldr/trac/ticket/10219 + Unicode 9: http://unicode.org/cldr/trac/ticket/9692 + +*** merge the Unicode update branches back onto the trunk +- do not merge the icudata.jar and testdata.jar, + instead rebuild them from merged & tested ICU4C +- make sure that changes to Unicode tools are checked in: + http://www.unicode.org/utility/trac/log/trunk/unicodetools + +---------------------------------------------------------------------------- *** + Unicode 10.0 update for ICU 60 http://www.unicode.org/versions/Unicode10.0.0/