icuSources/data/translit/Hiragana_Katakana.txt

   1 #--------------------------------------------------------------------
   2 # Copyright (c) 1999-2004, International Business Machines
   3 # Corporation and others. All Rights Reserved.
   4 #--------------------------------------------------------------------
   5
   6 # note: a global filter is more efficient, but MUST include all source chars
   7 :: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;  # forward-direction global filter: only characters in this set are handed to the rules below
   8 :: NFKC ();  # NFKC normalization as the first forward step; the empty () supplies no counterpart step for the reverse direction
   9
  10 # Hiragana-Katakana
  11
  12 # This is largely a one-to-one mapping, but it has a
  13 # few kinks:
  14
  15 # 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
  16 # Hiragana equivalents.  We use Hiragana wa/wi/we/wo
  17 # (308F-3092) with a voicing mark (3099), which is
  18 # semantically equivalent.  However, this is a non-
  19 # roundtripping transformation.
  20
  21 # 2. The Katakana small ka/ke (30F5,30F6) have no
  22 # Hiragana equivalents.  We convert them to normal
  23 # Hiragana ka/ke (304B,3051).  This is a one-way
  24 # information-losing transformation and precludes
  25 # round-tripping of 30F5 and 30F6.
  26
  27 # 3. The combining marks 3099-309C are in the Hiragana
  28 # block, but they apply to Katakana as well, so we
  29 # leave them untouched.
  30
  31 # 4. The Katakana prolonged sound mark 30FC doubles the
  32 # preceding vowel.  This is a one-way information-
  33 # losing transformation from Katakana to Hiragana.
  34
  35 # 5. The Katakana middle dot separates words in foreign
  36 # expressions; we leave this unmodified.
  37
  38 # The above points preclude successful round-trip
  39 # transformations of arbitrary input text.  However,
  40 # they provide naturalistic results that should conform
  41 # to user expectations.
  42
  43
  44 # Combining equivalents va/vi/ve/vo
  45 わ゙ <> ヷ;  # wa 308F + voicing mark 3099 <> va 30F7; NOTE(review): these precede the plain wa/wi/we/wo rules, presumably so the two-character sequence is matched first
  46 ゐ゙ <> ヸ;  # wi 3090 + voicing mark 3099 <> vi 30F8
  47 ゑ゙ <> ヹ;  # we 3091 + voicing mark 3099 <> ve 30F9
  48 を゙ <> ヺ;  # wo 3092 + voicing mark 3099 <> vo 30FA
  49
  50 # One-to-one mappings, main block
  51 # 3041:3094 <> 30A1:30F4
  52 # 309D,E <> 30FD,E
  53 ぁ <> ァ;  # start of the two-way block: 3041 <> 30A1
  54 あ <> ア;
  55 ぃ <> ィ;
  56 い <> イ;
  57 ぅ <> ゥ;
  58 う <> ウ;
  59 ぇ <> ェ;
  60 え <> エ;
  61 ぉ <> ォ;
  62 お <> オ;
  63 か <> カ;
  64 が <> ガ;
  65 き <> キ;
  66 ぎ <> ギ;
  67 く <> ク;
  68 ぐ <> グ;
  69 け <> ケ;
  70 げ <> ゲ;
  71 こ <> コ;
  72 ご <> ゴ;
  73 さ <> サ;
  74 ざ <> ザ;
  75 し <> シ;
  76 じ <> ジ;
  77 す <> ス;
  78 ず <> ズ;
  79 せ <> セ;
  80 ぜ <> ゼ;
  81 そ <> ソ;
  82 ぞ <> ゾ;
  83 た <> タ;
  84 だ <> ダ;
  85 ち <> チ;
  86 ぢ <> ヂ;
  87 っ <> ッ;
  88 つ <> ツ;
  89 づ <> ヅ;
  90 て <> テ;
  91 で <> デ;
  92 と <> ト;
  93 ど <> ド;
  94 な <> ナ;
  95 に <> ニ;
  96 ぬ <> ヌ;
  97 ね <> ネ;
  98 の <> ノ;
  99 は <> ハ;
 100 ば <> バ;
 101 ぱ <> パ;
 102 ひ <> ヒ;
 103 び <> ビ;
 104 ぴ <> ピ;
 105 ふ <> フ;
 106 ぶ <> ブ;
 107 ぷ <> プ;
 108 へ <> ヘ;
 109 べ <> ベ;
 110 ぺ <> ペ;
 111 ほ <> ホ;
 112 ぼ <> ボ;
 113 ぽ <> ポ;
 114 ま <> マ;
 115 み <> ミ;
 116 む <> ム;
 117 め <> メ;
 118 も <> モ;
 119 ゃ <> ャ;
 120 や <> ヤ;
 121 ゅ <> ュ;
 122 ゆ <> ユ;
 123 ょ <> ョ;
 124 よ <> ヨ;
 125 ら <> ラ;
 126 り <> リ;
 127 る <> ル;
 128 れ <> レ;
 129 ろ <> ロ;
 130 ゎ <> ヮ;
 131 わ <> ワ;  # plain wa 308F; the voiced sequence wa + 3099 has its own rule earlier in the file
 132 ゐ <> ヰ;  # plain wi 3090
 133 ゑ <> ヱ;  # plain we 3091
 134 を <> ヲ;  # plain wo 3092
 135 ん <> ン;
 136 ゔ <> ヴ;  # end of the contiguous range: 3094 <> 30F4
 137 ゝ <> ヽ;  # iteration mark 309D <> 30FD
 138 ゞ <> ヾ;  # voiced iteration mark 309E <> 30FE
 139
 140 # One-way Katakana-Hiragana xform of small K ka/ke to
 141 # normal H ka/ke.
 142 か < ヵ;  # Katakana-to-Hiragana direction only: small ka 30F5 becomes normal ka 304B
 143 け < ヶ;  # Katakana-to-Hiragana direction only: small ke 30F6 becomes normal ke 3051
 144
 145 # Katakana followed by a prolonged sound mark 30FC has
 146 # its final vowel doubled.  This is a Katakana-Hiragana
 147 # one-way information-losing transformation.  We
 148 # include the small Katakana (e.g., small A 30A1) and
 149 # do not distinguish them from their large
 150 # counterparts.  It doesn't make sense to double a
 151 # small counterpart vowel as a small Hiragana vowel, so
 152 # we don't do so.  In natural text this should never
 153 # occur anyway.  If a 30FC is seen without a preceding
 154 # vowel sound (e.g., after n 30F3) we do not change it.
 155
 156 ### $long = ー;
 157
 158 # The following categories are Hiragana, not Katakana
 159 # as might be expected, since by the time we get to the
 160 # 30FC, the preceding character will have already been
 161 # transformed to Hiragana.
 162
 163 # {The following was mechanically generated from the
 164 # Unicode 3.0 data:}
 165
 166 $xa = [ \
 167 ぁ あ か が さ ざ \
 168 た だ な は ば ぱ \
 169 ま ゃ や ら ゎ わ \
 170 ];  # $xa: Hiragana treated as ending in the vowel a; used as the left context of the rule that rewrites ー as あ
 171
 172 $xi = [ \
 173 ぃ い き ぎ し じ \
 174 ち ぢ に ひ び ぴ \
 175 み り ゐ \
 176 ];  # $xi: Hiragana treated as ending in the vowel i; left context for rewriting ー as い
 177
 178 $xu = [ \
 179 ぅ う く ぐ す ず \
 180 っ つ づ ぬ ふ ぶ \
 181 ぷ む ゅ ゆ る ゔ \
 182 ];  # $xu: Hiragana treated as ending in the vowel u (note: includes small tsu っ); left context for rewriting ー as う
 183
 184 $xe = [ \
 185 ぇ え け げ せ ぜ \
 186 て で ね へ べ ぺ \
 187 め れ ゑ \
 188 ];  # $xe: Hiragana treated as ending in the vowel e; left context for rewriting ー as え
 189
 190 $xo = [ \
 191 ぉ お こ ご そ ぞ \
 192 と ど の ほ ぼ ぽ \
 193 も ょ よ ろ を \
 194 ];  # $xo: Hiragana treated as ending in the vowel o; left context for rewriting ー as お
 195
 196 あ < $xa {ー};  # Katakana-to-Hiragana only: ー (30FC) preceded by a $xa kana is replaced by あ; the kana outside the braces is context and is left as is
 197 い < $xi {ー};  # same, for a preceding $xi kana -> い
 198 う < $xu {ー};  # same, for a preceding $xu kana -> う
 199 え < $xe {ー};  # same, for a preceding $xe kana -> え
 200 お < $xo {ー};  # same, for a preceding $xo kana -> お; a ー with no listed kana before it (e.g. after ん) matches none of these rules
 201
 202 :: (NFKC) ;  # parenthesized form: NFKC normalization applies to the reverse (Katakana-to-Hiragana) direction only
 203
 204 # note: a global filter is more efficient, but MUST include all source chars!!
 205 :: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);  # parenthesized form: global filter for the reverse direction; same set as the forward filter at the top of the file
 206
 207 # eof