git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/layout/TibetanReordering.h

Commit	Line	Data
	1	/*
	2	*
	3	* (C) Copyright IBM Corp. 1998-2013 - All Rights Reserved
	4	*
	5	* Developed at DIT - Government of Bhutan
	6	*
	7	* Contact person: Pema Geyleg - <pema_geyleg@druknet.bt>
	8	*
	9	* This file is a modification of the ICU file KhmerReordering.h
	10	* by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan
	11	* A first module for Dzongkha was developed by Karunakar under Panlocalisation funding.
	12	* Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola
	13	*
	14	*/
	15
	16	#ifndef __TIBETANREORDERING_H
	17	#define __TIBETANREORDERING_H
	18
	19	/**
	20	* \file
	21	* \internal
	22	*/
	23
	24	// #include "LETypes.h"
	25	// #include "OpenTypeTables.h"
	26
	27	U_NAMESPACE_BEGIN
	28
	29	class LEGlyphStorage;
	30
	31	// Vocabulary
	32	// Base -> A consonant in its full (not subscript) form. It is the
	33	// center of the syllable; it can be surrounded by subjoined consonants, vowels,
	34	// signs... but there is only one base in a stack, and it has to be coded as
	35	// the first character of the syllable. Included here are also groups of base + subjoined
	36	// which are represented by one single code point in Unicode (e.g. 0F43), and also other characters that might take
	37	// subjoined consonants or other combining characters.
	38	// Subjoined -> Subjoined consonants and groups of subjoined consonants which have a single code-point
	39	// to represent the group (even if each subjoined consonant is represented independently
	40	// by another code-point).
	41	// Tsa Phru --> Tsa Phru character, Bhutanese people will always place it right after the base, but sometimes, due to
	42	// "normalization"
	43	// is placed after all the subjoined consonants, and it is also permitted there.
	44	// A Chung Vowel lengthening mark --> 0F71. It is placed after the base and any subjoined consonants but before any vowels
	45	// Precomposed Sanskrit vowels --> They are combinations of subjoined consonants + vowels that have been assigned
	46	// a given code-point (in spite of each single part of them also having a code-point).
	47	// They are avoided, and users are encouraged to use the combination of code-points that
	48	// represents the same sound instead of using these combined characters. This is included here
	49	// for compatibility with possible texts that use them (they are not in the Dzongkha keyboard).
	50	// Halanta -> The Halanta or Virama character 0F84 indicates that a consonant should not use its inherent vowel,
	51	// in spite of not having other vowels present. It is usually placed immediately after a base consonant,
	52	// but in some special cases it can also be placed after a subjoined consonant, so this is also
	53	// permitted in this algorithm. (Halanta is always displayed in Tibetan, not used as a connecting char.)
	54	//
	55	// Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There
	56	// might be as many as three subjoined vowels in a given stack (only one in general text, but up
	57	// to three for abbreviations, so they have to be permitted).
	58	// Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three
	59	// times). They can combine with subjoined vowels, and are always coded after these.
	60	// Anusvara --> Nasalisation sign. Traditionally placed in absence of vowels, but also after vowels. In some
	61	// special cases it can be placed before a vowel, so this is also permitted.
	62	// Candrabindu -> Forms of the Anusvara with different glyphs (and different in identity) which can be placed
	63	// without vowel or after the vowel, but never before. Cannot combine with Anusvara.
	64	// Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining
	65	// marks, so they have to be attached to a specific stack. They are used to emphasise a syllable.
	66	//
	67	// Digits -> Digits are not considered as non-combining characters because there are a few characters which
	68	// combine with them, so they have to be considered independently.
	69	// Digit combining marks -> dependent marks that combine with digits.
	70	//
	71	// TODO
	72	// There are a number of characters in the CJK block that are used in Tibetan script; two of these symbols
	73	// are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside
	74	// of the Tibetan block, they have not been treated in this program.
	75
	76
	77	struct TibetanClassTable // This list must include all types of components that can be used inside a syllable
	78	{
	79	enum CharClassValues // order is important here! This order must be the same that is found in each horizontal
	80	// line in the statetable for Tibetan (file TibetanReordering.cpp). It assigns one number
	81	// to each type of character that has to be considered when analysing the order in which
	82	// characters can be placed
	83	{
	84	CC_RESERVED = 0, //Non Combining Characters
	85	CC_BASE = 1, // Base Consonants, Base Consonants with Subjoined attached in code point, Sanskrit base marks
	86	CC_SUBJOINED = 2, // Subjoined Consonats, combination of more than Subjoined Consonants in the code point
	87	CC_TSA_PHRU = 3, // Tsa-Phru character 0F39
	88	CC_A_CHUNG = 4, // Vowel Lenthening a-chung mark 0F71
	89	CC_COMP_SANSKRIT = 5, // Precomposed Sanskrit vowels including Subjoined characters and vowels
	90	CC_HALANTA = 6, // Halanta Character 0F84
	91	CC_BELOW_VOWEL = 7, // Subjoined vowels
	92	CC_ABOVE_VOWEL = 8, // Superscript vowels
	93	CC_ANUSVARA = 9, // Tibetan sign Rjes Su Nga Ro 0F7E
	94	CC_CANDRABINDU = 10, // Tibetan sign Sna Ldan and Nyi Zla Naa Da 0F82, 0F83
	95	CC_VISARGA = 11, // Tibetan sign Rnam Bcad (0F7F)
	96	CC_ABOVE_S_MARK = 12, // Stress Marks placed above the text
	97	CC_BELOW_S_MARK = 13, // Stress Marks placed below the text
	98	CC_DIGIT = 14, // Dzongkha Digits
	99	CC_PRE_DIGIT_MARK = 15, // Mark placed before the digit
	100	CC_POST_BELOW_DIGIT_M = 16, // Mark placed below or after the digit
	101	CC_COUNT = 17 // This is the number of character classes
	102	};
	103
	104	enum CharClassFlags
	105	{
	106	CF_CLASS_MASK = 0x0000FFFF,
	107
	108	CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable
	109	CF_DIGIT = 0x01000000, // flag to speed up comparaisson
	110	CF_PREDIGIT = 0x02000000, // flag to detect pre-digit marks for reordering
	111
	112	// position flags
	113	CF_POS_BEFORE = 0x00080000,
	114	CF_POS_BELOW = 0x00040000,
	115	CF_POS_ABOVE = 0x00020000,
	116	CF_POS_AFTER = 0x00010000,
	117	CF_POS_MASK = 0x000f0000
	118	};
	119
	120	typedef le_uint32 CharClass;
	121
	122	typedef le_int32 ScriptFlags;
	123
	124	LEUnicode firstChar; // for Tibetan this will become xOF00
	125	LEUnicode lastChar; // and this x0FFF
	126	const CharClass *classTable;
	127
	128	CharClass getCharClass(LEUnicode ch) const;
	129
	130	static const TibetanClassTable *getTibetanClassTable();
	131	};
	132
	133
	134	class TibetanReordering /* not : public UObject because all methods are static */ {
	135	public:
	136	static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode,
	137	LEUnicode *outChars, LEGlyphStorage &glyphStorage);
	138
	139	static const FeatureMap *getFeatureMap(le_int32 &count);
	140
	141	private:
	142	// do not instantiate
	143	TibetanReordering();
	144
	145	static le_int32 findSyllable(const TibetanClassTable classTable, const LEUnicode chars, le_int32 prev, le_int32 charCount);
	146
	147	};
	148
	149
	150	U_NAMESPACE_END
	151	#endif

1

/*

*

*

* Developed at DIT - Government of Bhutan

6

*

7

* Contact person: Pema Geyleg - <pema_geyleg@druknet.bt>

8

*

9

* This file is a modification of the ICU file KhmerReordering.h

10

* by Jens Herden and Javier Sola who have given all their possible rights to IBM and the Governement of Bhutan

11

* A first module for Dzongkha was developed by Karunakar under Panlocalisation funding.

12

* Assistance for this module has been received from Namgay Thinley, Christopher Fynn and Javier Sola

*

*/

#ifndef __TIBETANREORDERING_H

17

#define __TIBETANREORDERING_H

/**

* \file

* \internal

*/

// #include "LETypes.h"

25

// #include "OpenTypeTables.h"

U_NAMESPACE_BEGIN

class LEGlyphStorage;

30

31

// Vocabulary

32

// Base -> A consonant in its full (not subscript) form. It is the

33

// center of the syllable; it can be surrounded by subjoined consonants, vowels,

34

// signs... but there is only one base in a stack, it has to be coded as

35

// the first character of the syllable. Included here are also groups of base + subjoined

36

// which are represented by one single code point in unicode (e.g. 0F43) Also other characters that might take

37

// subjoined consonants or other combining characters.

38

// Subjoined -> Subjoined consonants and groups of subjoined consonants which have a single code-point

39

// to represent the group (even if each subjoined consonant is represented independently

40

// by another code-point).

41

// Tsa Phru --> Tsa Phru character, Bhutanese people will always place it right after the base, but sometimes, due to

42

// "normalization"

43

// is placed after all the subjoined consonants, and it is also permitted there.

44

// A Chung Vowel lengthening mark --> . 0F71 It is placed after the base and any subjoined consonants but before any vowels

45

// Precomposed Sanskrit vowels --> They are combinations of subjoined consonants + vowels that have been assigned

46

// a given code-point (in spite of each single part of them having also a code-point

47

// They are avoided, and users are encouraged to use the combination of code-points that

48

// represents the same sound instead of using these combined characters. This is included here

49

// for compatibility with possible texts that use them (they are not in the Dzongkha keyboard).

50

// Halanta -> The Halanta or Virama character 0F84 indicates that a consonant should not use its inherent vowel,

51

// in spite of not having other vowels present. It is usually placed immediately after a base consonant,

52

// but in some special cases it can also be placed after a subjoined consonant, so this is also

53

// permitted in this algorithm. (Halanta is always displayed in Tibetan not used as a connecting char)

54

//

55

// Subjoined vowels -> Dependent vowels (matras) placed below the base and below all subjoined consonants. There

56

// might be as much as three subjoined vowels in a given stack (only one in general text, but up

57

// to three for abbreviations, so they have to be permitted).

58

// Superscript vowels -> There are three superscript vowels, and they can be repeated or combined (up to three

59

// times. They can combine with subjoined vowels, and are always coded after these.

60

// Anusvara --> Nasalisation sign. Traditionally placed in absence of vowels, but also after vowels. In some

61

// special cases it can be placed before a vowel, so this is also permitted

62

// Candrabindu -> Forms of the Anusvara with different glyphs (and different in identity) which can be placed

63

// without vowel or after the vowel, but never before. Cannot combine with Anusvara.

64

// Stress marks -> Marks placed above or below a syllable, affecting the whole syllable. They are combining

65

// marks, so they have to be attached to a specific stack. They are used to emphasise a syllable.

66

//

67

// Digits -> Digits are not considered as non-combining characters because there are a few characters which

68

// combine with them, so they have to be considered independently.

69

// Digit combining marks -> dependent marks that combine with digits.

70

//

71

// TODO

72

// There are a number of characters in the CJK block that are used in Tibetan script; two of these symbols

73

// are used as bases for combining glyphs, and have not been encoded in Tibetan. As these characters are outside

74

// of the tibetan block, they have not been treated in this program.

75

76

77

struct TibetanClassTable // This list must include all types of components that can be used inside a syllable

78

{

79

enum CharClassValues // order is important here! This order must be the same that is found in each horizontal

80

// line in the statetable for Tibetan (file TibetanReordering.cpp). It assigns one number

81

// to each type of character that has to be considered when analysing the order in which

82

// characters can be placed

83

{

84

CC_RESERVED = 0, //Non Combining Characters

85

CC_BASE = 1, // Base Consonants, Base Consonants with Subjoined attached in code point, Sanskrit base marks

86

CC_SUBJOINED = 2, // Subjoined Consonats, combination of more than Subjoined Consonants in the code point

87

CC_TSA_PHRU = 3, // Tsa-Phru character 0F39

88

CC_A_CHUNG = 4, // Vowel Lenthening a-chung mark 0F71

89

CC_COMP_SANSKRIT = 5, // Precomposed Sanskrit vowels including Subjoined characters and vowels

90

CC_HALANTA = 6, // Halanta Character 0F84

91

CC_BELOW_VOWEL = 7, // Subjoined vowels

92

CC_ABOVE_VOWEL = 8, // Superscript vowels

93

CC_ANUSVARA = 9, // Tibetan sign Rjes Su Nga Ro 0F7E

94

CC_CANDRABINDU = 10, // Tibetan sign Sna Ldan and Nyi Zla Naa Da 0F82, 0F83

95

CC_VISARGA = 11, // Tibetan sign Rnam Bcad (0F7F)

96

CC_ABOVE_S_MARK = 12, // Stress Marks placed above the text

97

CC_BELOW_S_MARK = 13, // Stress Marks placed below the text

98

CC_DIGIT = 14, // Dzongkha Digits

99

CC_PRE_DIGIT_MARK = 15, // Mark placed before the digit

100

CC_POST_BELOW_DIGIT_M = 16, // Mark placed below or after the digit

101

CC_COUNT = 17 // This is the number of character classes

};

enum CharClassFlags

{

CF_CLASS_MASK = 0x0000FFFF,

107

108

CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable

109

CF_DIGIT = 0x01000000, // flag to speed up comparaisson

110

CF_PREDIGIT = 0x02000000, // flag to detect pre-digit marks for reordering

111

112

// position flags

113

CF_POS_BEFORE = 0x00080000,

114

CF_POS_BELOW = 0x00040000,

115

CF_POS_ABOVE = 0x00020000,

116

CF_POS_AFTER = 0x00010000,

117

CF_POS_MASK = 0x000f0000

118

};

119

120

typedef le_uint32 CharClass;

121

122

typedef le_int32 ScriptFlags;

123

124

LEUnicode firstChar; // for Tibetan this will become xOF00

125

LEUnicode lastChar; // and this x0FFF

126

const CharClass *classTable;

127

128

CharClass getCharClass(LEUnicode ch) const;

129

130

static const TibetanClassTable *getTibetanClassTable();

};

class TibetanReordering /* not : public UObject because all methods are static */ {

135

public:

136

static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode,

137

LEUnicode *outChars, LEGlyphStorage &glyphStorage);

138

139

static const FeatureMap *getFeatureMap(le_int32 &count);

140

141

private:

142

// do not instantiate

143

TibetanReordering();

144

145

static le_int32 findSyllable(const TibetanClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount);

};

U_NAMESPACE_END

#endif