X-Git-Url: https://git.saurik.com/apple/icu.git/blobdiff_plain/51004dcb01e06fef634b61be77ed73dd61cb6db9..0f5d89e82340278ed3d7d50029f37cab2c41a57e:/icuSources/data/translit/es_FONIPA_zh.txt

diff --git a/icuSources/data/translit/es_FONIPA_zh.txt b/icuSources/data/translit/es_FONIPA_zh.txt
index 67dc66d4..e7798c93 100644
--- a/icuSources/data/translit/es_FONIPA_zh.txt
+++ b/icuSources/data/translit/es_FONIPA_zh.txt
@@ -1,15 +1,16 @@
-ï»¿# ***************************************************************************
-# *
-# *  Copyright (C) 2004-2013, International Business Machines
-# *  Corporation; Unicode, Inc.; and others.  All Rights Reserved.
-# *
-# ***************************************************************************
+ï»¿# Â© 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+#
 # File: es_FONIPA_zh.txt
-# Generated from CLDR 
+# Generated from CLDR
 #
+
+# Transforms Spanish to Mandarin Chinese. The input Spanish string must be in
+# phonemic IPA transcription (es_FONIPA); the output is in Simplified Chinese.
 $word_boundary = [-\ $];
 $vowel = [aeijouw];       # Vowels and glides
 $not_vowel = [^$vowel];
+# First pass: Collapse phonetic distinctions not preserved in Mandarin.
 Ã° â | d;
 É£ â | g;
 Å â | n;
@@ -35,13 +36,20 @@ s[Î¸s] â s;               # GB/T 17693.5-2009, 5.3.4
 [^Ê§] { jo â io;          # GB/T 17693.5-2009 è¡¨ 1, æ³¨ 7
 ::Null;
 j } an $not_vowel â i ;  # GB/T 17693.5-2009 è¡¨ 1, æ³¨ 8
+# GB/T 17693.5-2009 表 1, 注 8 also says that <uai> should be treated as if
+# it were <u> plus <ai>.  This is not borne out by the observed data, which
+# suggests that <ua> plus <i> is the more appropriate choice in some
+# situations.
 [g.$] { wai\u032F â wai ;
 wai\u032F â uai\u032F ;
 [g.$] { wau\u032F â wau ;
 wau\u032F â uau\u032F ;
 jau\u032F â iau\u032F ;
+# Even though "ao" is not a diphthong in Spanish, Mandarin treats it as one.
 [^jw] { ao } [^n]     â au\u032F ;
 [^jw] { ao } n $vowel â au\u032F ;
+# Main pass: Phoneme to Hanzi conversion.
+# This generally follows GB/T 17693.5-2009 表 1, unless otherwise noted.
 ::Null;
 '.' â ;
 ai\u032F â è¾ ;
@@ -145,6 +153,11 @@ fwen } $not_vowel â ä¸° ;
 fwe â å¯å ;
 fwi â å¯ä¼ ;
 fwo â ç¦ ;
+# The choice of 弗 vs. 夫 sounds simple according to the GB/T standard, but the
+# data suggest otherwise.  Ideally, 弗 should occur at the beginning of a
+# morpheme (e.g. in "villafranca" 比利亚弗兰卡) and 夫 everywhere else.  Since
+# we don't have morpheme boundaries, we'll fudge it by writing 夫 at the end of
+# a word and 弗 everywhere else.
 f } $word_boundary â å¤« ;
 f â å¼ ;
 gai\u032F â ç ;
@@ -410,6 +423,9 @@ tje â é ;
 tju â èä¹ ;
 ton } $not_vowel â é ;
 to â æ ;
+# The rules for /ts/ (tz in the orthography) are nonstandard and derived
+# entirely from the observed data.  They apply mostly to native toponyms
+# in Mexico.
 tsa â å¯ ;
 tsen } $not_vowel â å² ;
 tse â é ;
@@ -487,12 +503,26 @@ xwe â è¡å ;
 xwi â æ  ;
 xwo â é ;
 x â èµ« ;
+# 尔 simplification pass.  The idea is to drop most occurrences of 尔
+# corresponding to <r> (not to <l> or <ll>) from a word if there is another /l/
+# sound nearby.  There is a vague pattern like this in the data, but the details
+# remain to be determined.  At the moment, this does nothing; it just puts 尔 in
+# for every <r> in a syllable coda.
 ::Null;
 $r = [Rå©æ];
+#
+#
+# R } . $r â ;
+# R } .. $r â ;
+# R } ... $r â ;
+# R } .... $r â ;
 R â å° ;
+# Dong-nan-xi-hai pass.  Per GB/T 17693.5-2009 表 1, 注 4, replace confusing
+# characters at the beginning and end of a word.
 ::Null;
 $word_boundary { ä¸ â æ  ;
 $word_boundary { å â æ¥  ;
 $word_boundary { è¥¿ â é¡ ;
 æµ· } $word_boundary â äº¥ ;
 ::NFC;
+