X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/46f4442e9a5a4f3b98b7c1083586332f6a8a99a4..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/translit/Han_Spacedhan.txt diff --git a/icuSources/data/translit/Han_Spacedhan.txt b/icuSources/data/translit/Han_Spacedhan.txt index 5535eba4..9428d4dd 100644 --- a/icuSources/data/translit/Han_Spacedhan.txt +++ b/icuSources/data/translit/Han_Spacedhan.txt @@ -1,17 +1,24 @@ -# *************************************************************************** -# * -# * Copyright (C) 2004-2008, International Business Machines -# * Corporation; Unicode, Inc.; and others. All Rights Reserved. -# * -# *************************************************************************** +# © 2016 and later: Unicode, Inc. and others. +# License & terms of use: http://www.unicode.org/copyright.html#License +# # File: Han_Spacedhan.txt -# Generated from CLDR +# Generated from CLDR # + +# Only intended for internal use +# Make sure Han are normalized, including characters that contain them. +# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:] +# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release! +:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; :: fullwidth-halfwidth; -。 > '.'; +。 → '.'; $terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]]; $initialPunct = [:Ps:][:Pi:]; -[[:Ideographic:] $terminalPunct] {} [:Letter:] > ' ' ; -[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] > ' ' ; -< [:Ideographic:] { ' ' } [:Letter:] ; -< [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ; +# add space between any Han or terminal punctuation and letters, and +# between letters and Han or initial punct +[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ; +[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ; +# remove spacing between ideographs and other letters +← [:Ideographic:] { ' ' } [:Letter:] ; +← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ; +