1 # ***************************************************************************
3 # * Copyright (C) 2004-2016, International Business Machines
4 # * Corporation; Unicode, Inc.; and others. All Rights Reserved.
6 # ***************************************************************************
7 # File: es_FONIPA_zh.txt
11 # Transforms Spanish to Mandarin Chinese. The input Spanish string must be in
12 # phonemic IPA transcription (es_FONIPA); the output is in Simplified Chinese.
# Characters treated as delimiting a word: hyphen, (escaped) space, and the
# `$` text anchor.
13 $word_boundary = [-\ $];
14 $vowel = [aeijouw]; # Vowels and glides
# Complement of $vowel; used by the rules below as the following context that
# closes a syllable.
15 $not_vowel = [^$vowel];
16 # First pass: Collapse phonetic distinctions not preserved in Mandarin.
# Rule syntax: text before `{` is preceding context and text after `}` is
# following context; only the part in between is rewritten. \u032F is the
# combining inverted breve below, which marks a non-syllabic vowel in IPA.
# e + i\u032F becomes plain e unless preceded by d, g, k, t or x.
33 [^dgktx] { ei\u032F → e ;
# e + u\u032F drops the glide mark, and o + u\u032F becomes plain o, unless
# preceded by a hyphen, space, period or the text anchor.
34 [^-\ .$] { eu\u032F → eu ;
35 [^-\ .$] { ou\u032F → o;
# m becomes n before b or p, unless the m follows a word-boundary character.
37 [^$word_boundary] { m } [bp] → n; # GB/T 17693.5-2009, 5.3.2
# s followed by θ or s collapses to a single s.
38 s[θs] → s; # GB/T 17693.5-2009, 5.3.4
# jo becomes io unless preceded by ʧ.
39 [^ʧ] { jo → io; # GB/T 17693.5-2009 表 1, 注 7
# j becomes i before "an" plus a non-vowel.
41 j } an $not_vowel → i ; # GB/T 17693.5-2009 表 1, 注 8
42 # GB/T 17693.5-2009 表 1, 注 8 also says that <uai> should be treated as if
43 # it was <u> plus <ai>. This is not borne out by the observed data, which
44 # suggests that <ua> plus <i> is the more appropriate choice in some
# After g, a period or the text anchor the glide mark is dropped and w is kept;
# in every other position w becomes u and the glide mark stays. The
# context-restricted rule has to come first because rules are tried in order.
46 [g.$] { wai\u032F → wai ;
47 wai\u032F → uai\u032F ;
48 [g.$] { wau\u032F → wau ;
49 wau\u032F → uau\u032F ;
50 jau\u032F → iau\u032F ;
51 # Even though "ao" is not a diphthong in Spanish, Mandarin treats it as one.
# Applies when "ao" is not preceded by j or w and is followed either by a
# non-n character or by n plus a vowel/glide.
52 [^jw] { ao } [^n] → au\u032F ;
53 [^jw] { ao } n $vowel → au\u032F ;
54 # Main pass: Phoneme to Hanzi conversion.
55 # This generally follows GB/T 17693.5-2009 表 1, unless otherwise noted.
# Onset /b/ with a nasal coda. Each rule rewrites the text left of `}` only
# when the next character is in $not_vowel, i.e. when the n is not followed
# by a vowel or glide.
63 ban } $not_vowel → 班 ;
66 ben } $not_vowel → 本 ;
68 bin } $not_vowel → 宾 ;
71 bjen } $not_vowel → 比恩 ;
74 bon } $not_vowel → 邦 ;
76 bun } $not_vowel → 本 ;
78 bwan } $not_vowel → 布安 ;
80 bwen } $not_vowel → 布恩 ; # Should be 本, per GB/T 17693.5-2009 表 1.
82 bwin } $not_vowel → 布因 ; # Nonstandard, but fits observed data.
# Onset /β/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. βan/βwan share 万 and βen/βun/βwen share 文.
87 βan } $not_vowel → 万 ;
90 βen } $not_vowel → 文 ;
92 βin } $not_vowel → 温 ;
95 βjen } $not_vowel → 维恩 ;
98 βon } $not_vowel → 翁 ;
100 βun } $not_vowel → 文 ;
102 βwan } $not_vowel → 万 ;
104 βwen } $not_vowel → 文 ;
# Onset /d/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel.
110 dan } $not_vowel → 丹 ;
114 den } $not_vowel → 登 ;
116 din } $not_vowel → 丁 ;
119 djen } $not_vowel → 迪恩 ;
122 don } $not_vowel → 东 ;
124 dun } $not_vowel → 敦 ;
126 dwan } $not_vowel → 端 ;
128 dwen } $not_vowel → 敦 ;
# Empty replacement: a d directly before a word-boundary character is dropped.
132 d } $word_boundary → ;
# Onset-less "en" before a non-vowel.
135 en } $not_vowel → 恩 ;
# Onset /f/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. fon, fun and fwen all map to 丰.
139 fan } $not_vowel → 凡 ;
143 fin } $not_vowel → 芬 ;
146 fjen } $not_vowel → 菲恩 ;
149 fon } $not_vowel → 丰 ;
151 fun } $not_vowel → 丰 ;
153 fwan } $not_vowel → 富安 ;
155 fwen } $not_vowel → 丰 ;
159 # The choice of 弗 vs. 夫 sounds simple according to the GB/T standard, but the
160 # data suggest otherwise. Ideally, 弗 should occur at the beginning of a
161 # morpheme (e.g. in "villafranca" 比利亚弗兰卡) and 夫 everywhere else. Since
162 # we don't have morpheme boundaries, we'll fudge it by writing 夫 at the end of
163 # a word and 弗 everywhere else.
# This is the word-final half of that compromise: f directly before a
# word-boundary character becomes 夫.
164 f } $word_boundary → 夫 ;
# Onset /g/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel (or, for the first gwan rule, in the set [$]).
167 gan } $not_vowel → 甘 ;
171 gen } $not_vowel → 根 ;
173 gin } $not_vowel → 金 ;
176 gjen } $not_vowel → 吉恩 ;
179 gon } $not_vowel → 贡 ;
181 gun } $not_vowel → 贡 ;
# Keep this ahead of the general gwan rule: rules are tried in order, and the
# negated set $not_vowel would presumably also match at the text anchor --
# TODO confirm against ICU's anchor handling.
183 gwan } [$] → 古安 ; # Nonstandard, but fits observed data.
184 gwan } $not_vowel → 关 ;
186 gwen } $not_vowel → 古恩 ;
# Onset-less "in" before a non-vowel.
191 in } $not_vowel → 因 ;
# Onset /ʝ/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. ʝun and ʝwen both map to 云.
194 ʝan } $not_vowel → 扬 ;
197 ʝen } $not_vowel → 延 ;
199 ʝin } $not_vowel → 因 ;
201 ʝon } $not_vowel → 永 ;
203 ʝun } $not_vowel → 云 ;
205 ʝwan } $not_vowel → 元 ;
207 ʝwen } $not_vowel → 云 ;
# Onset /k/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. kun, kwen and kwin all map to 昆.
213 kan } $not_vowel → 坎 ;
217 ken } $not_vowel → 肯 ;
219 kin } $not_vowel → 金 ;
222 kjen } $not_vowel → 基恩 ;
225 kon } $not_vowel → 孔 ;
227 kun } $not_vowel → 昆 ;
229 kwan } $not_vowel → 宽 ;
231 kwen } $not_vowel → 昆 ;
233 kwin } $not_vowel → 昆 ;
# Onset /l/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. len, lun and lwen all map to 伦.
239 lan } $not_vowel → 兰 ;
242 len } $not_vowel → 伦 ;
244 lin } $not_vowel → 林 ;
247 ljen } $not_vowel → 连 ;
250 lon } $not_vowel → 隆 ;
252 lun } $not_vowel → 伦 ;
254 lwan } $not_vowel → 卢安 ;
256 lwen } $not_vowel → 伦 ;
# Onset /ʎ/ with a nasal coda, under the same following-context condition.
261 ʎan } $not_vowel → 良 ;
264 ʎen } $not_vowel → 连 ;
266 ʎin } $not_vowel → 林 ;
268 ʎon } $not_vowel → 利翁 ;
271 ʎwan } $not_vowel → 柳安 ;
273 ʎwen } $not_vowel → 柳恩 ;
# Onset /m/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. men/mwen share 门 and mon/mun share 蒙.
279 man } $not_vowel → 曼 ;
283 men } $not_vowel → 门 ;
285 min } $not_vowel → 明 ;
288 mjen } $not_vowel → 缅 ;
291 mon } $not_vowel → 蒙 ;
293 mun } $not_vowel → 蒙 ;
295 mwan } $not_vowel → 穆安 ;
297 mwen } $not_vowel → 门 ;
299 mwin } $not_vowel → 穆因 ; # Nonstandard, but fits observed data.
# Onset /n/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. nen/nun share 嫩 and non/nwen share 农.
304 nan } $not_vowel → 南 ;
307 nen } $not_vowel → 嫩 ;
309 nin } $not_vowel → 宁 ;
312 njen } $not_vowel → 年 ;
315 non } $not_vowel → 农 ;
317 nun } $not_vowel → 嫩 ;
319 nwan } $not_vowel → 努安 ;
321 nwen } $not_vowel → 农 ;
# Onset /ɲ/ with a nasal coda, under the same following-context condition.
326 ɲan } $not_vowel → 尼扬 ;
329 ɲen } $not_vowel → 年 ;
331 ɲin } $not_vowel → 宁 ;
333 ɲon } $not_vowel → 尼翁 ;
336 ɲwan } $not_vowel → 纽安 ;
338 ɲwen } $not_vowel → 纽恩 ;
# Onset-less "on" before a non-vowel.
342 on } $not_vowel → 翁 ;
# Onset /p/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. pon, pun and pwen all map to 蓬.
346 pan } $not_vowel → 潘 ;
349 pen } $not_vowel → 彭 ;
351 pin } $not_vowel → 平 ;
354 pjen } $not_vowel → 皮恩 ;
357 pon } $not_vowel → 蓬 ;
359 pun } $not_vowel → 蓬 ;
361 pwan } $not_vowel → 普安 ;
363 pwen } $not_vowel → 蓬 ;
# Onset /r/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. ren, run and rwen all map to 伦.
369 ran } $not_vowel → 兰 ;
372 ren } $not_vowel → 伦 ;
374 rin } $not_vowel → 林 ;
377 rjen } $not_vowel → 连 ;
380 ron } $not_vowel → 龙 ;
382 run } $not_vowel → 伦 ;
384 rwan } $not_vowel → 鲁安 ;
386 rwen } $not_vowel → 伦 ;
# Onset /s/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. sun and swen both map to 孙.
392 san } $not_vowel → 桑 ;
395 sen } $not_vowel → 森 ;
397 sin } $not_vowel → 辛 ;
400 sjen } $not_vowel → 先 ;
403 son } $not_vowel → 松 ;
405 sun } $not_vowel → 孙 ;
407 swan } $not_vowel → 苏安 ;
409 swen } $not_vowel → 孙 ;
# Onset /t/ (and /ts/) with a nasal coda; each rule applies only when the next
# character is in $not_vowel. ton, tun and twen all map to 通.
415 tan } $not_vowel → 坦 ;
419 ten } $not_vowel → 滕 ;
421 tin } $not_vowel → 廷 ;
424 tjen } $not_vowel → 蒂恩 ;
427 ton } $not_vowel → 通 ;
429 # The rules for /ts/ (tz in the orthography) are nonstandard and derived
430 # entirely from the observed data. They apply mostly to native toponyms
433 tsen } $not_vowel → 岑 ;
435 tsin } $not_vowel → 钦 ;
438 tsun } $not_vowel → 聪 ;
441 tun } $not_vowel → 通 ;
443 twan } $not_vowel → 图安 ;
445 twen } $not_vowel → 通 ;
# Onset /ʧ/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. ʧan/ʧjan/ʧjen share 钱, and ʧjon/ʧon/ʧun/ʧwen share 琼.
451 ʧan } $not_vowel → 钱 ;
454 ʧen } $not_vowel → 琴 ;
456 ʧin } $not_vowel → 钦 ;
458 ʧjan } $not_vowel → 钱 ;
460 ʧjen } $not_vowel → 钱 ;
462 ʧjon } $not_vowel → 琼 ;
464 ʧon } $not_vowel → 琼 ;
466 ʧun } $not_vowel → 琼 ; # Should be 春, per GB/T 17693.5-2009 表 1.
468 ʧwan } $not_vowel → 丘安 ;
470 ʧwen } $not_vowel → 琼 ;
# Onset-less "un" and onset /w/ with a nasal coda; each rule applies only when
# the next character is in $not_vowel. un, wen and win all map to 温.
475 un } $not_vowel → 温 ;
477 wan } $not_vowel → 万 ;
479 wen } $not_vowel → 温 ;
481 win } $not_vowel → 温 ;
483 won } $not_vowel → 翁 ; # Unseen.
# Onset /x/ with a nasal coda; each rule applies only when the next character
# is in $not_vowel. xon, xun and xwen all map to 洪.
486 xan } $not_vowel → 汉 ;
490 xen } $not_vowel → 亨 ;
492 xin } $not_vowel → 欣 ;
495 xjen } $not_vowel → 希恩 ;
498 xon } $not_vowel → 洪 ;
500 xun } $not_vowel → 洪 ;
502 xwan } $not_vowel → 胡安 ;
504 xwen } $not_vowel → 洪 ;
509 # 尔 simplification pass. The idea is to drop most occurrences of 尔
510 # corresponding to <r> (not to <l> or <ll>) from a word if there is another /l/
511 # sound nearby. There is a vague pattern like this in the data, but the details
512 # remain to be determined. At the moment, this does nothing, it just puts 尔 in
513 # for every <r> in a syllable coda.
523 # Dong-nan-xi-hai pass. Per GB/T 17693.5-2009 表 1, 注 4, replace confusing
524 # characters at the beginning and end of a word.
# 东, 南 and 西 are replaced only when directly preceded by a word-boundary
# character (word-initial position).
526 $word_boundary { 东 → 栋 ;
527 $word_boundary { 南 → 楠 ;
528 $word_boundary { 西 → 锡 ;
# 海 is replaced only when directly followed by a word-boundary character
# (word-final position).
529 海 } $word_boundary → 亥 ;