| 1 | /* |
| 2 | * |
| 3 | * (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved |
| 4 | * |
| 5 | * Developed at DIT - Government of Bhutan |
| 6 | * |
| 7 | * Contact person: Pema Geyleg - <pema_geyleg@druknet.bt> |
| 8 | * |
| 9 | * This file is a modification of the ICU file KhmerReordering.h |
| 10 | * by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan |
| 11 | * A first module for Dzongkha was developed by Karunakar under Panlocalisation funding. |
| 12 | * Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola |
| 13 | * |
| 14 | */ |
| 15 | |
| 16 | #ifndef __TIBETANREORDERING_H |
| 17 | #define __TIBETANORDERING_H |
| 18 | |
| 19 | /** |
| 20 | * \file |
| 21 | * \internal |
| 22 | */ |
| 23 | |
| 24 | // #include "LETypes.h" |
| 25 | // #include "OpenTypeTables.h" |
| 26 | |
| 27 | U_NAMESPACE_BEGIN |
| 28 | |
| 29 | class LEGlyphStorage; |
| 30 | |
| 31 | // Vocabulary |
// Base -> A consonant in its full (not subscript) form. It is the
//     center of the syllable; it can be surrounded by subjoined consonants, vowels,
//     signs... but there is only one base in a stack, and it has to be coded as
//     the first character of the syllable. Included here are also groups of base + subjoined
//     which are represented by one single code point in Unicode (e.g. 0F43), and also other
//     characters that might take subjoined consonants or other combining characters.
// Subjoined -> Subjoined consonants and groups of subjoined consonants which have a single code point
//     to represent the group (even if each subjoined consonant is represented independently
//     by another code point).
// Tsa Phru --> Tsa Phru character. Bhutanese people will always place it right after the base, but sometimes, due to
//     "normalization", it is placed after all the subjoined consonants, and it is also permitted there.
// A Chung Vowel lengthening mark --> 0F71. It is placed after the base and any subjoined consonants but before any vowels.
// Precomposed Sanskrit vowels --> These are combinations of subjoined consonants + vowels that have been assigned
//     a given code point (in spite of each single part of them also having a code point).
//     They are avoided, and users are encouraged to use the combination of code points that
//     represents the same sound instead of using these combined characters. They are included here
//     for compatibility with possible texts that use them (they are not in the Dzongkha keyboard).
// Halanta -> The Halanta or Virama character 0F84 indicates that a consonant should not use its inherent vowel,
//     in spite of not having other vowels present. It is usually placed immediately after a base consonant,
//     but in some special cases it can also be placed after a subjoined consonant, so this is also
//     permitted in this algorithm. (Halanta is always displayed in Tibetan, not used as a connecting char.)
| 54 | // |
// Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There
//     might be as many as three subjoined vowels in a given stack (only one in general text, but up
//     to three for abbreviations, so they have to be permitted).
// Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three
//     times). They can combine with subjoined vowels, and are always coded after these.
// Anusvara --> Nasalisation sign. Traditionally placed in the absence of vowels, but also after vowels. In some
//     special cases it can be placed before a vowel, so this is also permitted.
| 62 | // Candrabindu -> Forms of the Anusvara with different glyphs (and different in identity) which can be placed |
| 63 | // without vowel or after the vowel, but never before. Cannot combine with Anusvara. |
// Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining
//     marks, so they have to be attached to a specific stack. They are used to emphasise a syllable.
| 66 | // |
| 67 | // Digits -> Digits are not considered as non-combining characters because there are a few characters which |
| 68 | // combine with them, so they have to be considered independently. |
| 69 | // Digit combining marks -> dependent marks that combine with digits. |
| 70 | // |
// TODO
// There are a number of characters in the CJK block that are used in Tibetan script; two of these symbols
// are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside
// of the Tibetan block, they have not been treated in this program.
| 75 | |
| 76 | |
| 77 | struct TibetanClassTable // This list must include all types of components that can be used inside a syllable |
| 78 | { |
| 79 | enum CharClassValues // order is important here! This order must be the same that is found in each horizontal |
| 80 | // line in the statetable for Tibetan (file TibetanReordering.cpp). It assigns one number |
| 81 | // to each type of character that has to be considered when analysing the order in which |
| 82 | // characters can be placed |
| 83 | { |
| 84 | CC_RESERVED = 0, //Non Combining Characters |
| 85 | CC_BASE = 1, // Base Consonants, Base Consonants with Subjoined attached in code point, Sanskrit base marks |
| 86 | CC_SUBJOINED = 2, // Subjoined Consonats, combination of more than Subjoined Consonants in the code point |
| 87 | CC_TSA_PHRU = 3, // Tsa-Phru character 0F39 |
| 88 | CC_A_CHUNG = 4, // Vowel Lenthening a-chung mark 0F71 |
| 89 | CC_COMP_SANSKRIT = 5, // Precomposed Sanskrit vowels including Subjoined characters and vowels |
| 90 | CC_HALANTA = 6, // Halanta Character 0F84 |
| 91 | CC_BELOW_VOWEL = 7, // Subjoined vowels |
| 92 | CC_ABOVE_VOWEL = 8, // Superscript vowels |
| 93 | CC_ANUSVARA = 9, // Tibetan sign Rjes Su Nga Ro 0F7E |
| 94 | CC_CANDRABINDU = 10, // Tibetan sign Sna Ldan and Nyi Zla Naa Da 0F82, 0F83 |
| 95 | CC_VISARGA = 11, // Tibetan sign Rnam Bcad (0F7F) |
| 96 | CC_ABOVE_S_MARK = 12, // Stress Marks placed above the text |
| 97 | CC_BELOW_S_MARK = 13, // Stress Marks placed below the text |
| 98 | CC_DIGIT = 14, // Dzongkha Digits |
| 99 | CC_PRE_DIGIT_MARK = 15, // Mark placed before the digit |
| 100 | CC_POST_BELOW_DIGIT_M = 16, // Mark placed below or after the digit |
| 101 | CC_COUNT = 17 // This is the number of character classes |
| 102 | }; |
| 103 | |
| 104 | enum CharClassFlags |
| 105 | { |
| 106 | CF_CLASS_MASK = 0x0000FFFF, |
| 107 | |
| 108 | CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable |
| 109 | CF_DIGIT = 0x01000000, // flag to speed up comparaisson |
| 110 | CF_PREDIGIT = 0x02000000, // flag to detect pre-digit marks for reordering |
| 111 | |
| 112 | // position flags |
| 113 | CF_POS_BEFORE = 0x00080000, |
| 114 | CF_POS_BELOW = 0x00040000, |
| 115 | CF_POS_ABOVE = 0x00020000, |
| 116 | CF_POS_AFTER = 0x00010000, |
| 117 | CF_POS_MASK = 0x000f0000 |
| 118 | }; |
| 119 | |
| 120 | typedef le_uint32 CharClass; |
| 121 | |
| 122 | typedef le_int32 ScriptFlags; |
| 123 | |
| 124 | LEUnicode firstChar; // for Tibetan this will become xOF00 |
| 125 | LEUnicode lastChar; // and this x0FFF |
| 126 | const CharClass *classTable; |
| 127 | |
| 128 | CharClass getCharClass(LEUnicode ch) const; |
| 129 | |
| 130 | static const TibetanClassTable *getTibetanClassTable(); |
| 131 | }; |
| 132 | |
| 133 | |
| 134 | class TibetanReordering /* not : public UObject because all methods are static */ { |
| 135 | public: |
| 136 | static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode, |
| 137 | LEUnicode *outChars, LEGlyphStorage &glyphStorage); |
| 138 | |
| 139 | static const FeatureMap *getFeatureMap(le_int32 &count); |
| 140 | |
| 141 | private: |
| 142 | // do not instantiate |
| 143 | TibetanReordering(); |
| 144 | |
| 145 | static le_int32 findSyllable(const TibetanClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount); |
| 146 | |
| 147 | }; |
| 148 | |
| 149 | |
| 150 | U_NAMESPACE_END |
| 151 | #endif |