1 # ***************************************************************************
3 # * Copyright (C) 2004-2016, International Business Machines
4 # * Corporation; Unicode, Inc.; and others. All Rights Reserved.
6 # ***************************************************************************
7 # File: Han_Spacedhan.txt
11 # Only intended for internal use
12 # Make sure Han characters are normalized, including compatibility characters that contain them (for example, parenthesized or circled ideographs).
13 # The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
14 # Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
# Pre-pass 1: run the nfkc transform, restricted by the leading filter set.
# The filter is the union of the literal ranges listed first (the hand-computed
# set described in the header comment) with [:ideographic:] and [:sc=han:];
# characters outside the filter are not passed to nfkc by this line.
15 :: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;
# Pre-pass 2: run the fullwidth-halfwidth transform; no filter is given here,
# so it is not limited to the set used above.
16 :: fullwidth-halfwidth;
# Character classes referenced by the spacing rules that follow.
#
# $terminalPunct: the punctuation literals listed inside the set, plus all
# characters with general category Pe (close punctuation) or Pf (final quote
# punctuation).
# NOTE(review): as the line appears here, the unescaped . , : ? ! ; repeat the
# backslash-escaped ASCII characters before them; presumably they were distinct
# (e.g. fullwidth) forms at some point -- verify the encoding against upstream.
18 $terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
# $initialPunct: general categories Ps (open punctuation) and Pi (initial quote
# punctuation).
# NOTE(review): unlike $terminalPunct, this value is two adjacent sets rather
# than one bracketed set. In this section it is only referenced inside [...];
# confirm that is intended before referencing it outside a set.
19 $initialPunct = [:Ps:][:Pi:];
20 # add space between any Han or terminal punctuation and letters, and
21 # between letters and Han or initial punct
# Forward rules (→). The empty braces {} mark a zero-length match position, so
# each rule inserts ' ' at a boundary without consuming the neighbouring text.
# NOTE(review): [:Letter:] is a general-category class; presumably it also
# matches ideographs, which would make the first rule separate consecutive
# ideographs as well -- TODO confirm.
#
# Boundary preceded by an ideograph or $terminalPunct and followed by a letter.
22 [[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
# Boundary preceded by a letter (plus any number of marks attached to it) and
# followed by an ideograph or $initialPunct.
23 [:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;
24 # remove spacing between ideographs and other letters
# Reverse rules (←): the pattern on the right is matched and replaced by the
# empty left-hand side. Only the space inside the braces is replaced; the text
# outside the braces is context.
# Unlike the forward rules, these contexts do not mention $terminalPunct or
# $initialPunct, so spaces next to punctuation are not matched by them.
#
# A space preceded by an ideograph and followed by a letter.
25 ← [:Ideographic:] { ' ' } [:Letter:] ;
# A space preceded by a letter (plus any marks) and followed by an ideograph.
26 ← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;