ICU-62107.0.1.tar.gz

[apple/icu.git] / icuSources / data / translit / es_FONIPA_zh.txt
diff --git a/icuSources/data/translit/es_FONIPA_zh.txt b/icuSources/data/translit/es_FONIPA_zh.txt

index 1eb75b8c1d12ef72aa84c991cde48bad761bc494..e7798c936153d2763ab9f82f3e9b3108bbc356f0 100644 (file)
--- a/icuSources/data/translit/es_FONIPA_zh.txt
+++ b/icuSources/data/translit/es_FONIPA_zh.txt
@@ -1,15 +1,16 @@
-# ***************************************************************************
-# *
-# *  Copyright (C) 2004-2012, International Business Machines
-# *  Corporation; Unicode, Inc.; and others.  All Rights Reserved.
-# *
-# ***************************************************************************
+# © 2016 and later: Unicode, Inc. and others.
+# License & terms of use: http://www.unicode.org/copyright.html#License
+#
  # File: es_FONIPA_zh.txt
-# Generated from CLDR 
+# Generated from CLDR
  #
+
+# Transforms Spanish to Mandarin Chinese. The input Spanish string must be in
+# phonemic IPA transcription (es_FONIPA); the output is in Simplified Chinese.
  $word_boundary = [-\ $];
-$vowel = [aeijouw]; # Vowels and glides
+$vowel = [aeijouw];       # Vowels and glides
  $not_vowel = [^$vowel];
+# First pass: Collapse phonetic distinctions not preserved in Mandarin.
  ð → | d;
  ɣ → | g;
  ŋ → | n;
@@ -29,19 +30,26 @@ uu → u ;
  [^dgktx] { ei\u032F → e ;
  [^-\ .$] { eu\u032F → eu ;
  [^-\ .$] { ou\u032F → o;
-[^j] { ui → wi ;
-[^$word_boundary] { m } [bp] → n; # GB/T 17693.5-2009, 5.3.2
-s[θs] → s; # GB/T 17693.5-2009, 5.3.4
-[^ʧ] { jo → io; # GB/T 17693.5-2009 表 1, 注 7
+[^j]     { ui → wi ;
+[^$word_boundary] { m } [bp] → n;  # GB/T 17693.5-2009, 5.3.2
+s[θs] → s;               # GB/T 17693.5-2009, 5.3.4
+[^ʧ] { jo → io;          # GB/T 17693.5-2009 表 1, 注 7
  ::Null;
-j } an $not_vowel → i ; # GB/T 17693.5-2009 表 1, 注 8
+j } an $not_vowel → i ;  # GB/T 17693.5-2009 表 1, 注 8
+# GB/T 17693.5-2009 表 1, 注 8 also says that <uai> should be treated as if
+# it was <u> plus <ai>.  This is not borne out by the observed data, which
+# suggests that <ua> plus <i> is the more appropriate choice in some
+# situations.
  [g.$] { wai\u032F → wai ;
  wai\u032F → uai\u032F ;
  [g.$] { wau\u032F → wau ;
  wau\u032F → uau\u032F ;
  jau\u032F → iau\u032F ;
-[^jw] { ao } [^n] → au\u032F ;
+# Even though "ao" is not a diphthong in Spanish, Mandarin treats it as one.
+[^jw] { ao } [^n]     → au\u032F ;
  [^jw] { ao } n $vowel → au\u032F ;
+# Main pass: Phoneme to Hanzi conversion.
+# This generally follows GB/T 17693.5-2009 表 1, unless otherwise noted.
  ::Null;
  '.' → ;
  ai\u032F → 艾 ;
@@ -66,9 +74,9 @@ bun } $not_vowel → 本 ;
  bu → 布 ;
  bwan } $not_vowel → 布安 ;
  bwa → 布阿 ;
-bwen } $not_vowel → 布恩 ; # Should be be 本, per GB/T 17693.5-2009 表 1.
+bwen } $not_vowel → 布恩 ;  # Should be 本, per GB/T 17693.5-2009 表 1.
  bwe → 布埃 ;
-bwin } $not_vowel → 布因 ; # Nonstandard, but fits observed data.
+bwin } $not_vowel → 布因 ;  # Nonstandard, but fits observed data.
  bwi → 布伊 ;
  bwo → 博 ;
  b → 布 ;
@@ -145,6 +153,11 @@ fwen } $not_vowel → 丰 ;
  fwe → 富埃 ;
  fwi → 富伊 ;
  fwo → 福 ;
+# The choice of 弗 vs. 夫 sounds simple according to the GB/T standard, but the
+# data suggest otherwise.  Ideally, 弗 should occur at the beginning of a
+# morpheme (e.g. in "villafranca" 比利亚弗兰卡) and 夫 everywhere else.  Since
+# we don't have morpheme boundaries, we'll fudge it by writing 夫 at the end of
+# a word and 弗 everywhere else.
  f } $word_boundary → 夫 ;
  f → 弗 ;
  gai\u032F → 盖 ;
@@ -164,7 +177,7 @@ gon } $not_vowel → 贡 ;
  go → 戈 ;
  gun } $not_vowel → 贡 ;
  gu → 古 ;
-gwan } [$] → 古安 ; # Nonstandard, but fits observed data.
+gwan } [$] → 古安 ;        # Nonstandard, but fits observed data.
  gwan } $not_vowel → 关 ;
  gwa → 瓜 ;
  gwen } $not_vowel → 古恩 ;
@@ -280,7 +293,7 @@ mwan } $not_vowel → 穆安 ;
  mwa → 穆阿 ;
  mwen } $not_vowel → 门 ;
  mwe → 穆埃 ;
-mwin } $not_vowel → 穆因 ; # Nonstandard, but fits observed data.
+mwin } $not_vowel → 穆因 ;  # Nonstandard, but fits observed data.
  mwi → 穆伊 ;
  mwo → 莫 ;
  m → 姆 ;
@@ -410,6 +423,9 @@ tje → 铁 ;
  tju → 蒂乌 ;
  ton } $not_vowel → 通 ;
  to → 托 ;
+# The rules for /ts/ (tz in the orthography) are nonstandard and derived
+# entirely from the observed data.  They apply mostly to native toponyms
+# in Mexico.
  tsa → 察 ;
  tsen } $not_vowel → 岑 ;
  tse → 采 ;
@@ -421,7 +437,7 @@ tsu → 楚 ;
  ts → 茨 ;
  tun } $not_vowel → 通 ;
  tu → 图 ;
-twan } $not_vowel → 图安 ;
+twan } $not_vowel → 图安   ;
  twa → 图阿 ;
  twen } $not_vowel → 通 ;
  twe → 图埃 ;
@@ -444,7 +460,7 @@ t → 特 ;
  ʧju → 丘 ;
  ʧon } $not_vowel → 琼 ;
  ʧo → 乔 ;
-ʧun } $not_vowel → 琼 ; # Should be 春, per GB/T 17693.5-2009 表 1.
+ʧun } $not_vowel → 琼 ;  # Should be 春, per GB/T 17693.5-2009 表 1.
  ʧu → 丘 ;
  ʧwan } $not_vowel → 丘安 ;
  ʧwa → 丘阿 ;
@@ -461,7 +477,7 @@ wen } $not_vowel → 温 ;
  we → 韦 ;
  win } $not_vowel → 温 ;
  wi → 维 ;
-won } $not_vowel → 翁 ; # Unseen.
+won } $not_vowel → 翁 ;  # Unseen.
  wo → 沃 ;
  xai\u032F → 海 ;
  xan } $not_vowel → 汉 ;
@@ -487,12 +503,26 @@ xwe → 胡埃 ;
  xwi → 惠 ;
  xwo → 霍 ;
  x → 赫 ;
+# 尔 simplification pass.  The idea is to drop most occurrences of 尔
+# corresponding to <r> (not to <l> or <ll>) from a word if there is another /l/
+# sound nearby.  There is a vague pattern like this in the data, but the details
+# remain to be determined.  At the moment, this does nothing; it just puts 尔 in
+# for every <r> in a syllable coda.
  ::Null;
  $r = [R利拉];
+#
+#
+# R } . $r → ;
+# R } .. $r → ;
+# R } ... $r → ;
+# R } .... $r → ;
  R → 尔 ;
+# Dong-nan-xi-hai pass.  Per GB/T 17693.5-2009 表 1, 注 4, replace confusing
+# characters at the beginning and end of a word.
  ::Null;
  $word_boundary { 东 → 栋 ;
  $word_boundary { 南 → 楠 ;
  $word_boundary { 西 → 锡 ;
  海 } $word_boundary → 亥 ;
  ::NFC;
+