[apple/icu.git] / icuSources / layout / TibetanReordering.h

/*
 *
 * (C) Copyright IBM Corp. 1998-2016 - All Rights Reserved 
 *
 * Developed at DIT - Government of Bhutan
 *
 * Contact person: Pema Geyleg - <pema_geyleg@druknet.bt> 
 *
 * This file is a modification of the ICU file KhmerReordering.h
 * by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan
 * A first module for Dzongkha was developed by Karunakar under Panlocalisation funding.
 * Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola
 *
 */

#ifndef __TIBETANREORDERING_H
#define __TIBETANREORDERING_H

/**
 * \file
 * \internal
 */

#include "LETypes.h"
#include "OpenTypeTables.h"

U_NAMESPACE_BEGIN

class LEGlyphStorage;

// Vocabulary 
//     Base ->         A consonant in its full (not subscript) form. It is the
//                     center of the syllable; it can be surrounded by subjoined consonants, vowels,
//                     signs... but there is only one base in a stack, and it has to be coded as
//                     the first character of the syllable. Included here are also groups of base + subjoined
//                     which are represented by one single code point in Unicode (e.g. 0F43), as well as other
//                     characters that might take subjoined consonants or other combining characters.
//     Subjoined ->    Subjoined consonants and groups of subjoined consonants which have a single code-point
//                     to represent the group (even if each subjoined consonant is represented independently
//                     by another code-point).
//     Tsa Phru -->    Tsa Phru character. Bhutanese people will always place it right after the base, but sometimes,
//                     due to "normalization", it is placed after all the subjoined consonants, and it is also
//                     permitted there.
//     A Chung -->     Vowel lengthening mark (0F71). It is placed after the base and any subjoined consonants but before any vowels.
//     Precomposed Sanskrit vowels --> These are combinations of subjoined consonants + vowels that have been assigned
//                     a given code-point (in spite of each single part of them also having a code-point).
//                     They are avoided, and users are encouraged to use the combination of code-points that
//                     represents the same sound instead of using these combined characters. This is included here
//                     for compatibility with possible texts that use them (they are not in the Dzongkha keyboard).
//     Halanta ->      The Halanta or Virama character 0F84 indicates that a consonant should not use its inherent vowel,
//                     in spite of not having other vowels present. It is usually placed immediately after a base consonant,
//                     but in some special cases it can also be placed after a subjoined consonant, so this is also
//                     permitted in this algorithm. (Halanta is always displayed in Tibetan, not used as a connecting char.)
//
//     Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There
//                     might be as many as three subjoined vowels in a given stack (only one in general text, but up
//                     to three for abbreviations, so they have to be permitted).
//     Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three
//                     times). They can combine with subjoined vowels, and are always coded after these.
//     Anusvara -->    Nasalisation sign. Traditionally placed in absence of vowels, but also after vowels. In some
//                     special cases it can be placed before a vowel, so this is also permitted.
//     Candrabindu ->  Forms of the Anusvara with different glyphs (and different in identity) which can be placed
//                     without vowel or after the vowel, but never before. Cannot combine with Anusvara.
//     Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining
//                     marks, so they have to be attached to a specific stack. They are used to emphasise a syllable.
//
//     Digits ->       Digits are not considered as non-combining characters because there are a few characters which
//                     combine with them, so they have to be considered independently.
//     Digit combining marks -> dependent marks that combine with digits.
//     
//     TODO
//     There are a number of characters in the CJK block that are used in Tibetan script; two of these symbols
//     are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside
//     of the Tibetan block, they have not been treated in this program.
    

/**
 * Character classification data used when analysing Tibetan syllables.
 *
 * The enumerations declared here must list every type of component that can
 * be used inside a syllable.
 */
struct TibetanClassTable
{
    /** One table entry: a 32-bit word that is tested against the CF_* masks below. */
    typedef le_uint32 CharClass;

    typedef le_int32 ScriptFlags;

    /**
     * Character class numbers.
     *
     * The order is significant: it must be the same as the order of the
     * columns in each horizontal line of the Tibetan state table (file
     * TibetanReordering.cpp). Each type of character that matters when
     * analysing the permitted character order gets exactly one number.
     */
    enum CharClassValues
    {
        CC_RESERVED           =  0,  // non-combining characters
        CC_BASE               =  1,  // base consonants, base consonants with a subjoined consonant
                                     // attached in the code point, Sanskrit base marks
        CC_SUBJOINED          =  2,  // subjoined consonants, including several subjoined consonants
                                     // combined in one code point
        CC_TSA_PHRU           =  3,  // Tsa-Phru character 0F39
        CC_A_CHUNG            =  4,  // vowel lengthening a-chung mark 0F71
        CC_COMP_SANSKRIT      =  5,  // precomposed Sanskrit vowels (subjoined characters plus vowels)
        CC_HALANTA            =  6,  // Halanta character 0F84
        CC_BELOW_VOWEL        =  7,  // subjoined vowels
        CC_ABOVE_VOWEL        =  8,  // superscript vowels
        CC_ANUSVARA           =  9,  // Tibetan sign Rjes Su Nga Ro 0F7E
        CC_CANDRABINDU        = 10,  // Tibetan signs Sna Ldan and Nyi Zla Naa Da 0F82, 0F83
        CC_VISARGA            = 11,  // Tibetan sign Rnam Bcad 0F7F
        CC_ABOVE_S_MARK       = 12,  // stress marks placed above the text
        CC_BELOW_S_MARK       = 13,  // stress marks placed below the text
        CC_DIGIT              = 14,  // Dzongkha digits
        CC_PRE_DIGIT_MARK     = 15,  // mark placed before the digit
        CC_POST_BELOW_DIGIT_M = 16,  // mark placed below or after the digit
        CC_COUNT              = 17   // total number of character classes
    };

    /**
     * Bit masks applied to a CharClass word.
     */
    enum CharClassFlags
    {
        // Low 16 bits.
        // NOTE(review): presumably this is where the CharClassValues number
        // is stored - confirm against TibetanReordering.cpp.
        CF_CLASS_MASK    = 0x0000FFFF,

        // Position flags; CF_POS_MASK is the union of the four position bits.
        CF_POS_AFTER     = 0x00010000,
        CF_POS_ABOVE     = 0x00020000,
        CF_POS_BELOW     = 0x00040000,
        CF_POS_BEFORE    = 0x00080000,
        CF_POS_MASK      = 0x000f0000,

        CF_DIGIT         = 0x01000000,  // shortcut flag to speed up comparison
        CF_PREDIGIT      = 0x02000000,  // identifies pre-digit marks for reordering
        CF_DOTTED_CIRCLE = 0x04000000   // a dotted circle is added when a character carrying
                                        // this flag is the first one in a syllable
    };

    LEUnicode firstChar;          // for Tibetan this will become 0x0F00
    LEUnicode lastChar;           // and this 0x0FFF
    const CharClass *classTable;  // NOTE(review): presumably one entry per code point in
                                  // [firstChar, lastChar] - confirm in TibetanReordering.cpp

    /** Accessor for the shared Tibetan table; defined outside this header. */
    static const TibetanClassTable *getTibetanClassTable();

    /** Yields the CharClass word for code unit ch; defined outside this header. */
    CharClass getCharClass(LEUnicode ch) const;
};


/**
 * Holder for the Tibetan reordering entry points.
 *
 * Intentionally not derived from UObject: every method is static and the
 * class is never meant to be instantiated.
 */
class TibetanReordering {
public:
    /**
     * Accessor for the feature map used with this script; count is an output
     * parameter passed by reference.
     * NOTE(review): presumably count receives the number of map entries -
     * confirm in TibetanReordering.cpp.
     */
    static const FeatureMap *getFeatureMap(le_int32 &count);

    /**
     * Processes charCount code units from theChars, writing to outChars and
     * glyphStorage.
     * NOTE(review): the meaning of the le_int32 result (presumably the number
     * of output characters) is not visible in this header - confirm in
     * TibetanReordering.cpp.
     */
    static le_int32 reorder(const LEUnicode *theChars,
                            le_int32 charCount,
                            le_int32 scriptCode,
                            LEUnicode *outChars,
                            LEGlyphStorage &glyphStorage);

private:
    /**
     * Helper operating on chars with the given class table.
     * NOTE(review): presumably scans from index prev and returns the index
     * where the syllable ends, bounded by charCount - confirm in
     * TibetanReordering.cpp.
     */
    static le_int32 findSyllable(const TibetanClassTable *classTable,
                                 const LEUnicode *chars,
                                 le_int32 prev,
                                 le_int32 charCount);

    // Private constructor: do not instantiate.
    TibetanReordering();
};


U_NAMESPACE_END
#endif
Commit	Line	Data
73c04bcf A	1	/*
73c04bcf A	2	*
2ca993e8	3	* (C) Copyright IBM Corp. 1998-2016 - All Rights Reserved
73c04bcf A	4	*
	5	* Developed at DIT - Government of Bhutan
	6	*
	7	* Contact person: Pema Geyleg - <pema_geyleg@druknet.bt>
	8	*
	9	* This file is a modification of the ICU file KhmerReordering.h
	10	* by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan
	11	* A first module for Dzongkha was developed by Karunakar under Panlocalisation funding.
	12	* Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola
	13	*
	14	*/
	15
	16	#ifndef __TIBETANREORDERING_H
57a6839d	17	#define __TIBETANREORDERING_H
73c04bcf A	18
	19	/**
	20	* \file
	21	* \internal
	22	*/
	23
2ca993e8 A	24	#include "LETypes.h"
2ca993e8 A	25	#include "OpenTypeTables.h"
73c04bcf A	26
	27	U_NAMESPACE_BEGIN
	28
	29	class LEGlyphStorage;
	30
	31	// Vocabulary
	32	// Base -> A consonant in its full (not subscript) form. It is the
	33	// center of the syllable, it can be souranded by subjoined consonants, vowels,
	34	// signs... but there is only one base in a stack, it has to be coded as
	35	// the first character of the syllable.Included here are also groups of base + subjoined
	36	// which are represented by one single code point in unicode (e.g. 0F43) Also other characters that might take
	37	// subjoined consonants or other combining characters.
	38	// Subjoined -> Subjoined consonants and groups of subjoined consonants which have a single code-point
	39	// to repersent the group (even if each subjoined consonant is represented independently
	40	// by anothe code-point
	41	// Tsa Phru --> Tsa Phru character, Bhutanese people will always place it right after the base, but sometimes, due to
	42	// "normalization"
	43	// is placed after all the subjoined consonants, and it is also permitted there.
	44	// A Chung Vowel lengthening mark --> . 0F71 It is placed after the base and any subjoined consonants but before any vowels
	45	// Precomposed Sanskrit vowels --> The are combinations of subjoined consonants + vowels that have been assigned
	46	// a given code-point (in spite of each single part of them having also a code-point
	47	// They are avoided, and users are encouraged to use the combination of code-points that
	48	// represents the same sound instead of using this combined characters. This is included here
	49	// for compatibility with possible texts that use them (they are not in the Dzongkha keyboard).
	50	// Halanta -> The Halanta or Virama character 0F84 indicates that a consonant should not use its inheernt vowel,
	51	// in spite of not having other vowels present. It is usually placed immediatly after a base consonant,
	52	// but in some special cases it can also be placed after a subjoined consonant, so this is also
	53	// permitted in this algorithm. (Halanta is always displayed in Tibetan not used as a connecting char)
	54	//
	55	// Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There
	56	// might be as much as three subjoined vowels in a given stack (only one in general text, but up
	57	// to three for abreviations, they have to be permitted).
	58	// Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three
	59	// times. They can combine with subjoined vowels, and are always coded after these.
	60	// Anusvara --> Nasalisation sign. Traditioinally placed in absence of vowels, but also after vowels. In some
	61	// special cases it can be placed before a vowel, so this is also permitted
	62	// Candrabindu -> Forms of the Anusvara with different glyphs (and different in identity) which can be placed
	63	// without vowel or after the vowel, but never before. Cannot combine with Anusvara.
	64	// Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining
	65	// marks, so they have to be attached to a specific stack. The are using to emphasise a syllable.
	66	//
	67	// Digits -> Digits are not considered as non-combining characters because there are a few characters which
	68	// combine with them, so they have to be considered independently.
	69	// Digit combining marks -> dependent marks that combine with digits.
	70	//
	71	// TODO
	72	// There are a number of characters in the CJK block that are used in Tibetan script, two of these are symbols
	73	// are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside
	74	// of the tibetan block, they have not been treated in this program.
	75
	76
	77	struct TibetanClassTable // This list must include all types of components that can be used inside a syllable
	78	{
	79	enum CharClassValues // order is important here! This order must be the same that is found in each horizontal
	80	// line in the statetable for Tibetan (file TibetanReordering.cpp). It assigns one number
	81	// to each type of character that has to be considered when analysing the order in which
	82	// characters can be placed
	83	{
	84	CC_RESERVED = 0, //Non Combining Characters
	85	CC_BASE = 1, // Base Consonants, Base Consonants with Subjoined attached in code point, Sanskrit base marks
	86	CC_SUBJOINED = 2, // Subjoined Consonats, combination of more than Subjoined Consonants in the code point
	87	CC_TSA_PHRU = 3, // Tsa-Phru character 0F39
	88	CC_A_CHUNG = 4, // Vowel Lenthening a-chung mark 0F71
	89	CC_COMP_SANSKRIT = 5, // Precomposed Sanskrit vowels including Subjoined characters and vowels
90	CC_HALANTA = 6, // Halanta Character 0F84
91	CC_BELOW_VOWEL = 7, // Subjoined vowels
92	CC_ABOVE_VOWEL = 8, // Superscript vowels
93	CC_ANUSVARA = 9, // Tibetan sign Rjes Su Nga Ro 0F7E
94	CC_CANDRABINDU = 10, // Tibetan sign Sna Ldan and Nyi Zla Naa Da 0F82, 0F83
95	CC_VISARGA = 11, // Tibetan sign Rnam Bcad (0F7F)
96	CC_ABOVE_S_MARK = 12, // Stress Marks placed above the text
97	CC_BELOW_S_MARK = 13, // Stress Marks placed below the text
98	CC_DIGIT = 14, // Dzongkha Digits
99	CC_PRE_DIGIT_MARK = 15, // Mark placed before the digit
100	CC_POST_BELOW_DIGIT_M = 16, // Mark placed below or after the digit
101	CC_COUNT = 17 // This is the number of character classes
102	};
103
104	enum CharClassFlags
105	{
106	CF_CLASS_MASK = 0x0000FFFF,
107
108	CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable
109	CF_DIGIT = 0x01000000, // flag to speed up comparaisson
110	CF_PREDIGIT = 0x02000000, // flag to detect pre-digit marks for reordering
111
112	// position flags
113	CF_POS_BEFORE = 0x00080000,
114	CF_POS_BELOW = 0x00040000,
115	CF_POS_ABOVE = 0x00020000,
116	CF_POS_AFTER = 0x00010000,
117	CF_POS_MASK = 0x000f0000
118	};
119
120	typedef le_uint32 CharClass;
121
122	typedef le_int32 ScriptFlags;
123
124	LEUnicode firstChar; // for Tibetan this will become xOF00
125	LEUnicode lastChar; // and this x0FFF
126	const CharClass *classTable;
127
128	CharClass getCharClass(LEUnicode ch) const;
129
130	static const TibetanClassTable *getTibetanClassTable();
131	};
132
133
134	class TibetanReordering /* not : public UObject because all methods are static */ {
135	public:
136	static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode,
137	LEUnicode *outChars, LEGlyphStorage &glyphStorage);
138
139	static const FeatureMap *getFeatureMap(le_int32 &count);
140
141	private:
142	// do not instantiate
143	TibetanReordering();
144
145	static le_int32 findSyllable(const TibetanClassTable classTable, const LEUnicode chars, le_int32 prev, le_int32 charCount);
146
147	};
148
149
150	U_NAMESPACE_END
151	#endif