git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/layout/KhmerReordering.h

Commit	Line	Data
	1	/*
	2	* (C) Copyright IBM Corp. 1998-2005 - All Rights Reserved
	3	*
	4	* This file is a modification of the ICU file IndicReordering.h
	5	* by Jens Herden and Javier Sola for Khmer language
	6	*
	7	*/
	8
	9	#ifndef __KHMERREORDERING_H
	10	#define __KHMERREORDERING_H
	11
	12	/**
	13	* \file
	14	* \internal
	15	*/
	16
	17	#include "LETypes.h"
	18	#include "OpenTypeTables.h"
	19
	20	U_NAMESPACE_BEGIN
	21
	22	class LEGlyphStorage;
	23
	24	// Vocabulary
	25	// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
	26	// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
	27	// split vowels, signs... but there is only one base in a syllable, it has to be coded as
	28	// the first character of the syllable.
	29	// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
	30	// Khmer language has five of them. Khmer split vowels either have one part before the
	31	// base and one after the base or they have a part before the base and a part above the base.
	32	// The first part of all Khmer split vowels is the same character, identical to
	33	// the glyph of Khmer dependent vowel SRA EI
	34	// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
	35	// Unlike Indian languages, the coeng modifies the consonant that follows it,
	36	// not the one preceding it. Each consonant has two forms, the base form and the subscript form:
	37	// the base form is the normal one (using the consonants code-point), the subscript form is
	38	// displayed when the combination coeng + consonant is encountered.
	39	// Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant
	40	// Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO)
	41	// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
	42	// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
	43	// if it is attached to a consonant of the first series or a consonant of the second series
	44	// Most consonants have an equivalent in the other series, but some of them exist only in
	45	// one series (for example SA). If we want to use the consonant SA with a vowel sound that
	46	// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
	47	// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
	48	// x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
	49	// MUSIKATOAN a second series consonant to have a first series vowel sound.
	50	// Consonant shifters are both normally superscript marks, but, when they are followed by a
	51	// superscript, they change shape and take the form of subscript dependent vowel SRA U.
	52	// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
	53	// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
	54	// be placed after the coeng consonant.
	55	// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
	56	// Each vowel has its own position. Only one vowel per syllable is allowed.
	57	// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
	58	// Allowed in a syllable.
	59	//
	60	//
	61
	62	struct KhmerClassTable // This list must include all types of components that can be used inside a syllable
	63	{
	64	enum CharClassValues // order is important here! This order must be the same that is found in each horizontal
	65	// line in the statetable for Khmer (file KhmerReordering.cpp).
	66	{
	67	CC_RESERVED = 0,
	68	CC_CONSONANT = 1, // consonant of type 1 or independent vowel
	69	CC_CONSONANT2 = 2, // Consonant of type 2
	70	CC_CONSONANT3 = 3, // Consonant of type 3
	71	CC_ZERO_WIDTH_NJ_MARK = 4, // Zero Width non joiner character (0x200C)
	72	CC_CONSONANT_SHIFTER = 5,
	73	CC_ROBAT = 6, // Khmer special diacritic accent -treated differently in state table
	74	CC_COENG = 7, // Subscript consonant combining character
	75	CC_DEPENDENT_VOWEL = 8,
	76	CC_SIGN_ABOVE = 9,
	77	CC_SIGN_AFTER = 10,
	78	CC_ZERO_WIDTH_J_MARK = 11, // Zero width joiner character
	79	CC_COUNT = 12 // This is the number of character classes
	80	};
	81
	82	enum CharClassFlags
	83	{
	84	CF_CLASS_MASK = 0x0000FFFF,
	85
	86	CF_CONSONANT = 0x01000000, // flag to speed up comparing
	87	CF_SPLIT_VOWEL = 0x02000000, // flag for a split vowel -> the first part is added in front of the syllable
	88	CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable
	89	CF_COENG = 0x08000000, // flag to speed up comparing
	90	CF_SHIFTER = 0x10000000, // flag to speed up comparing
	91	CF_ABOVE_VOWEL = 0x20000000, // flag to speed up comparing
	92
	93	// position flags
	94	CF_POS_BEFORE = 0x00080000,
	95	CF_POS_BELOW = 0x00040000,
	96	CF_POS_ABOVE = 0x00020000,
	97	CF_POS_AFTER = 0x00010000,
	98	CF_POS_MASK = 0x000f0000
	99	};
	100
	101	typedef le_uint32 CharClass;
	102
	103	typedef le_int32 ScriptFlags;
	104
	105	LEUnicode firstChar; // for Khmer this will become x1780
	106	LEUnicode lastChar; // and this x17DF
	107	const CharClass *classTable;
	108
	109	CharClass getCharClass(LEUnicode ch) const;
	110
	111	static const KhmerClassTable *getKhmerClassTable();
	112	};
	113
	114
	115	class KhmerReordering /* not : public UObject because all methods are static */ {
	116	public:
	117	static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode,
	118	LEUnicode *outChars, LEGlyphStorage &glyphStorage);
	119
	120	static const FeatureMap *getFeatureMap(le_int32 &count);
	121
	122	private:
	123	// do not instantiate
	124	KhmerReordering();
	125
	126	static le_int32 findSyllable(const KhmerClassTable classTable, const LEUnicode chars, le_int32 prev, le_int32 charCount);
	127
	128	};
	129
	130
	131	U_NAMESPACE_END
	132	#endif

1

/*

2

3

*

4

* This file is a modification of the ICU file IndicReordering.h

5

* by Jens Herden and Javier Sola for Khmer language

*

*/

#ifndef __KHMERREORDERING_H

10

#define __KHMERREORDERING_H

/**

* \file

* \internal

*/

#include "LETypes.h"

#include "OpenTypeTables.h"

U_NAMESPACE_BEGIN

class LEGlyphStorage;

23

24

// Vocabulary

25

// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the

26

// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,

27

// split vowels, signs... but there is only one base in a syllable, it has to be coded as

28

// the first character of the syllable.

29

// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).

30

// Khmer language has five of them. Khmer split vowels either have one part before the

31

// base and one after the base or they have a part before the base and a part above the base.

32

// The first part of all Khmer split vowels is the same character, identical to

33

// the glyph of Khmer dependent vowel SRA EI

34

// coeng --> modifier used in Khmer to construct coeng (subscript) consonants

35

// Unlike Indian languages, the coeng modifies the consonant that follows it,

36

// not the one preceding it. Each consonant has two forms, the base form and the subscript form:

37

// the base form is the normal one (using the consonants code-point), the subscript form is

38

// displayed when the combination coeng + consonant is encountered.

39

// Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant

40

// Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO)

41

// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)

42

// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds

43

// if it is attached to a consonant of the first series or a consonant of the second series

44

// Most consonants have an equivalent in the other series, but some of them exist only in

45

// one series (for example SA). If we want to use the consonant SA with a vowel sound that

46

// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant

47

// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN

48

// x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and

49

// MUSIKATOAN a second series consonant to have a first series vowel sound.

50

// Consonant shifters are both normally superscript marks, but, when they are followed by a

51

// superscript, they change shape and take the form of subscript dependent vowel SRA U.

52

// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they

53

// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should

54

// be placed after the coeng consonant.

55

// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base

56

// Each vowel has its own position. Only one vowel per syllable is allowed.

57

// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are

58

// Allowed in a syllable.

//

//

struct KhmerClassTable // This list must include all types of components that can be used inside a syllable

63

{

64

enum CharClassValues // order is important here! This order must be the same that is found in each horizontal

65

// line in the statetable for Khmer (file KhmerReordering.cpp).

66

{

67

CC_RESERVED = 0,

68

CC_CONSONANT = 1, // consonant of type 1 or independent vowel

69

CC_CONSONANT2 = 2, // Consonant of type 2

70

CC_CONSONANT3 = 3, // Consonant of type 3

71

CC_ZERO_WIDTH_NJ_MARK = 4, // Zero Width non joiner character (0x200C)

72

CC_CONSONANT_SHIFTER = 5,

73

CC_ROBAT = 6, // Khmer special diacritic accent -treated differently in state table

74

CC_COENG = 7, // Subscript consonant combining character

75

CC_DEPENDENT_VOWEL = 8,

76

CC_SIGN_ABOVE = 9,

77

CC_SIGN_AFTER = 10,

78

CC_ZERO_WIDTH_J_MARK = 11, // Zero width joiner character

79

CC_COUNT = 12 // This is the number of character classes

};

enum CharClassFlags

{

CF_CLASS_MASK = 0x0000FFFF,

85

86

CF_CONSONANT = 0x01000000, // flag to speed up comparing

87

CF_SPLIT_VOWEL = 0x02000000, // flag for a split vowel -> the first part is added in front of the syllable

88

CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if a character with this flag is the first in a syllable

89

CF_COENG = 0x08000000, // flag to speed up comparing

90

CF_SHIFTER = 0x10000000, // flag to speed up comparing

91

CF_ABOVE_VOWEL = 0x20000000, // flag to speed up comparing

92

93

// position flags

94

CF_POS_BEFORE = 0x00080000,

95

CF_POS_BELOW = 0x00040000,

96

CF_POS_ABOVE = 0x00020000,

97

CF_POS_AFTER = 0x00010000,

98

CF_POS_MASK = 0x000f0000

99

};

100

101

typedef le_uint32 CharClass;

102

103

typedef le_int32 ScriptFlags;

104

105

LEUnicode firstChar; // for Khmer this will become x1780

106

LEUnicode lastChar; // and this x17DF

107

const CharClass *classTable;

108

109

CharClass getCharClass(LEUnicode ch) const;

110

111

static const KhmerClassTable *getKhmerClassTable();

};

class KhmerReordering /* not : public UObject because all methods are static */ {

116

public:

117

static le_int32 reorder(const LEUnicode *theChars, le_int32 charCount, le_int32 scriptCode,

118

LEUnicode *outChars, LEGlyphStorage &glyphStorage);

119

120

static const FeatureMap *getFeatureMap(le_int32 &count);

121

122

private:

123

// do not instantiate

124

KhmerReordering();

125

126

static le_int32 findSyllable(const KhmerClassTable *classTable, const LEUnicode *chars, le_int32 prev, le_int32 charCount);

};

U_NAMESPACE_END

#endif