| 1 | # *************************************************************************** |
| 2 | # * |
| 3 | # * Copyright (C) 2004-2016, International Business Machines |
| 4 | # * Corporation; Unicode, Inc.; and others. All Rights Reserved. |
| 5 | # * |
| 6 | # *************************************************************************** |
| 7 | # File: si_si_FONIPA.txt |
| 8 | # Generated from CLDR |
| 9 | # |
| 10 | |
| 11 | # Sinhala pronunciation rules |
| 12 | # |
| 13 | # Output |
| 14 | # k ɡ ŋ ᵑɡ c ɟ ɲ ʈ ɖ ⁿɖ t d n ⁿd p b m ᵐb j r l w ʃ s z h f |
| 15 | # ə əː a aː æ æː i iː u uː e eː o oː |
| 16 | # |
| 17 | # References |
| 18 | # [1] Asanka Wasala, Ruvan Weerasinghe, and Kumudu Gamage: |
| 19 | # Sinhala Grapheme-to-Phoneme Conversion and Rules for Schwa Epenthesis. |
| 20 | # Proceedings of the COLING/ACL 2006 Main Conference Poster Sessions, |
| 21 | # pages 890–897. http://www.aclweb.org/anthology/P06-2114 |
| 22 | # Simplify ya + yansaya (ය + virama + ය) to plain ය when it follows a consonant + virama, optionally with a ZWJ (U+200D) in between. |
| 23 | [\u0D9A-\u0DC6] \u0DCA (\u200D)? { ය\u0DCAය → ය; |
| 24 | # Delete ZWNJ (U+200C) and ZWJ (U+200D) so that the later rules do not have to account for them. |
| 25 | \u200C → ; |
| 26 | \u200D → ; |
| 27 | # Insert a schwa (ə) after every consonant [U+0D9A-U+0DC6] that is not followed by a dependent vowel |
| 28 | # or virama [U+0DCA-U+0DDF, U+0DF2, U+0DF3]. The ::Null; below starts a new pass, so this rule sees the text after the deletions above. |
| 29 | ::Null; |
| 30 | ([\u0D9A-\u0DC6]) } [^\u0DCA-\u0DDF \u0DF2\u0DF3] → $1 ə; |
| 31 | # Pronunciation rules proper: a new pass (::Null;) that maps Sinhala letters and signs to IPA. |
| 32 | ::Null; |
| 33 | # fප is an alternative spelling of ෆ. |
| 34 | # This occurs e.g. in ඩේව\u0DD2ඩ\u0DCA කොපර\u0DCAfප\u0DD3ල\u0DCAඩ\u0DCA (David Copperfield) |
| 35 | # [see http://bradshawofthefuture.blogspot.com/2013/02/f.html]. |
| 36 | [Ff]ප → f; |
| 37 | # zස is seemingly the only way to unambiguously indicate a voiced /z/ sound. |
| 38 | # This occurs in e.g. ඇල\u0DCAzසය\u0DD2ම' රෝගය (Alzheimer's disease) |
| 39 | # [see https://si.wikipedia.org/wiki/ඇල\u0DCAzසය\u0DD2ම%27_රෝගය] |
| 40 | # or in zස\u0DD3බ\u0DCAරා (zebra) [see https://si.wikipedia.org/wiki/zස\u0DD3බ\u0DCAරා]. |
| 41 | [Zz]ස → z; |
| 42 | ං → ŋ; |
| 43 | o → ŋ; # character commonly substituted for anusvaraya (ං), so it gets the same value |
| 44 | ඃ ([\u0D9A-\u0DC6]) → | $1 \u0DCA $1; # before a consonant: rewrite as consonant + virama + consonant (gemination) and reprocess from the cursor; TODO: check which consonants geminate |
| 45 | ඃ → h; |
| 46 | අ → a; |
| 47 | ආ → aː; |
| 48 | ඇ → æ; |
| 49 | ඈ → æː; |
| 50 | ඉ → i; |
| 51 | ඊ → iː; |
| 52 | උ → u; |
| 53 | ඌ → uː; |
| 54 | ඍ → ri; |
| 55 | ඎ → ruː; |
| 56 | ඏ → ilu; |
| 57 | ඐ → iluː; |
| 58 | එ → e; |
| 59 | ඒ → eː; |
| 60 | ඓ → aj; |
| 61 | ඔ → o; |
| 62 | ඕ → oː; |
| 63 | ඖ → aw; # TODO: check if /aw/ is the correct value for this letter |
| 64 | ක → k; |
| 65 | ඛ → k; |
| 66 | ග → ɡ; |
| 67 | ඝ → ɡ; |
| 68 | ඞ → ŋ; |
| 69 | ඟ → ᵑɡ; |
| 70 | ච → c; |
| 71 | ඡ → c; |
| 72 | ජ → ɟ; |
| 73 | ඣ → ɟ; |
| 74 | ඤ → ɲ; |
| 75 | ඥ → kɲ; # TODO: double-check that /kɲ/ is the right transcription for this letter |
| 76 | ඦ → ɟ; |
| 77 | ට → ʈ; |
| 78 | ඨ → ʈ; |
| 79 | ඩ → ɖ; |
| 80 | ඪ → ɖ; |
| 81 | ණ → n; |
| 82 | ඬ → ⁿɖ; |
| 83 | ත → t; |
| 84 | ථ → t; |
| 85 | ද → d; |
| 86 | ධ → d; |
| 87 | න → n; |
| 88 | ඳ → ⁿd; |
| 89 | ප → p; |
| 90 | ඵ → p; |
| 91 | බ → b; |
| 92 | භ → b; |
| 93 | ම → m; |
| 94 | ඹ → ᵐb; |
| 95 | ය → j; |
| 96 | ර → r; |
| 97 | ල → l; |
| 98 | ව → w; |
| 99 | ශ → ʃ; |
| 100 | ෂ → ʃ; |
| 101 | ස → s; |
| 102 | හ → h; |
| 103 | ළ → l; |
| 104 | ෆ → f; |
| 105 | \u0DCA → ; # delete virama; the schwa-insertion pass already left consonants followed by it without a vowel |
| 106 | ා → aː; |
| 107 | ැ → æ; |
| 108 | ෑ → æː; |
| 109 | \u0DD2 → i; |
| 110 | \u0DD3 → iː; |
| 111 | \u0DD4 → u; |
| 112 | \u0DD6 → uː; |
| 113 | ෘ → ru; |
| 114 | ෙ → e; |
| 115 | ේ → eː; |
| 116 | ෛ → aj; |
| 117 | ො → o; |
| 118 | ෝ → oː; |
| 119 | ෞ → aw; # TODO: check if /aw/ is the correct value for this vowel sign |
| 120 | ෟ → lu; |
| 121 | ෲ → ruː; |
| 122 | ෳ → luː; |
| 123 | # Heuristics for turning /ə/ into /a/. Based on [1]. $c is the set of consonants emitted by the rules above; $s is any non-letter and is used as the word-boundary context. |
| 124 | $c=[k ɡ ŋ {ᵑɡ} c ɟ ɲ ʈ ɖ {ⁿɖ} t d n {ⁿd} p b m {ᵐb} j r l w ʃ s z h f]; |
| 125 | $s=[:^L:]; |
| 126 | # Rule #1: a schwa in the word-initial syllable ($s marks the word boundary) becomes /a/, unless one of the exceptions listed first keeps it. |
| 127 | ::Null; |
| 128 | $s s [vw] { ə → ə; # exception (a): initial /sv/ in [1]; ව is transcribed as w here, so match w too (the literal v is kept for compatibility) |
| 129 | $s k { ə } r → ə; # exception (b): initial /kər/ keeps its schwa |
| 130 | $s $c { ə } $s → ə; # exception (c): a word consisting of a single consonant + schwa keeps its schwa |
| 131 | $s $c $c { ə → a; |
| 132 | $s $c { ə → a; |
| 133 | # Rule #2: the vowel that follows a consonant + /r/ cluster. |
| 134 | ::Null; |
| 135 | $c r { ə } $c → a; # clause (a) and (b): schwa before a consonant becomes /a/ |
| 136 | $c r { a } h → a; # clause (d), exception: /a/ before /h/ is kept, shielding it from the next rule |
| 137 | $c r { a } $c → ə; # clause (c): /a/ before any other consonant becomes schwa |
| 138 | # Rule #3: a schwa after /h/ becomes /a/ when that /h/ follows one of a, e, æ, o, ə. |
| 139 | # The paper is unclear about what this rule means. The interpretation here |
| 140 | # assumes that "preceded" in the paper is a typo and should be read "followed". |
| 141 | ::Null; |
| 142 | [a e æ o ə] h { ə → a; |
| 143 | # Rules #4 through #7: decide between schwa and /a/ from the sounds that follow ($s marks the end of the word). |
| 144 | ::Null; |
| 145 | ə } $c $c → a; # Rule #4: schwa before two consonants |
| 146 | ə } [rbɖʈ] $s → ə; # Rule #5 exception: schwa is kept before a word-final r, b, ɖ or ʈ |
| 147 | ə } $c $s → a; # Rule #5: schwa before any other word-final consonant |
| 148 | ə } ji $s → a; # Rule #6: schwa before word-final /ji/ |
| 149 | k { ə } [rl] u → a; # Rule #7: schwa between /k/ and /ru/ or /lu/ |
| 150 | # Rule #8: word-initial /ka/ before /l/ becomes /kə/ in the contexts listed below (the third rule also turns the following /lə/ into /le/). |
| 151 | # Note that the paper doesn't say explicitly that this rule should be |
| 152 | # anchored at the beginning of a word, but the remarks before the rules |
| 153 | # seem to imply this. |
| 154 | ::Null; |
| 155 | $s k { a } l[aeo]ːj → ə; # Typo in paper: /j/ was written as /y/. |
| 156 | $s k { a } le[mh][ui] → ə; |
| 157 | $s k { alə } h[ui] → əle; |
| 158 | $s k { a } lə → ə; |
| 159 | # Diphthongs and glide clean-up, run as a final pass. |
| 160 | ::Null; |
| 161 | www+ → ww; # collapse three or more w into two, e.g. යෞව\u0DCAවන |
| 162 | [i {iː} e {eː} æ {æː} o {oː} a {aː}] { wu → w; |
| 163 | əji → aj; |
| 164 | iji → iː; # TODO: perhaps this should be ij instead |
| 165 | [u {uː} e {eː} æ {æː} o {oː} a {aː}] { ji → j; |
| 166 | |