]>
git.saurik.com Git - apple/javascriptcore.git/blob - yarr/YarrCanonicalizeUCS2.js
2 * Copyright (C) 2012 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 // See ES 5.1, 15.10.2.8
27 function canonicalize(ch
)
29 var u
= String
.fromCharCode(ch
).toUpperCase();
32 var cu
= u
.charCodeAt(0);
33 if (ch
>= 128 && cu
< 128)
38 var MAX_UCS2
= 0xFFFF;
41 var groupedCanonically
= [];
42 // Pass 1: populate groupedCanonically - this is a mapping from canonicalized
43 // values back to the set of character codes that canonicalize to them.
44 for (var i
= 0; i
<= MAX_UCS2
; ++i
) {
45 var ch
= canonicalize(i
);
46 if (!groupedCanonically
[ch
])
47 groupedCanonically
[ch
] = [];
48 groupedCanonically
[ch
].push(i
);
52 var latinTypeInfo
= [];
53 var characterSetInfo
= [];
54 // Pass 2: populate typeInfo & characterSetInfo. For every character calculate
55 // a typeInfo value, described by the types above, and a value payload.
56 for (cu
in groupedCanonically
) {
57 // The set of characters that canonicalize to cu
58 var characters
= groupedCanonically
[cu
];
60 // If there is only one, it is unique.
61 if (characters
.length
== 1) {
62 typeInfo
[characters
[0]] = "CanonicalizeUnique:0";
63 latinTypeInfo
[characters
[0]] = characters
[0] <= MAX_LATIN
? "CanonicalizeLatinSelf:0" : "CanonicalizeLatinInvalid:0";
68 characters
.sort(function(x
,y
){return x
-y
;});
70 // If there are more than two characters, create an entry in characterSetInfo.
71 if (characters
.length
> 2) {
73 typeInfo
[characters
[i
]] = "CanonicalizeSet:" + characterSetInfo
.length
;
74 characterSetInfo
.push(characters
);
76 if (characters
[1] <= MAX_LATIN
)
77 throw new Error("sets with more than one latin character not supported!");
78 if (characters
[0] <= MAX_LATIN
) {
80 latinTypeInfo
[characters
[i
]] = "CanonicalizeLatinOther:" + characters
[0];
81 latinTypeInfo
[characters
[0]] = "CanonicalizeLatinSelf:0";
84 latinTypeInfo
[characters
[i
]] = "CanonicalizeLatinInvalid:0";
90 // We have a pair: mark alternating ranges; otherwise track whether each character is the low or high partner.
91 var lo
= characters
[0];
92 var hi
= characters
[1];
95 var type
= lo
& 1 ? "CanonicalizeAlternatingUnaligned:0" : "CanonicalizeAlternatingAligned:0";
99 typeInfo
[lo
] = "CanonicalizeRangeLo:" + delta
;
100 typeInfo
[hi
] = "CanonicalizeRangeHi:" + delta
;
103 if (lo
> MAX_LATIN
) {
104 latinTypeInfo
[lo
] = "CanonicalizeLatinInvalid:0";
105 latinTypeInfo
[hi
] = "CanonicalizeLatinInvalid:0";
106 } else if (hi
> MAX_LATIN
) {
107 latinTypeInfo
[lo
] = "CanonicalizeLatinSelf:0";
108 latinTypeInfo
[hi
] = "CanonicalizeLatinOther:" + lo
;
110 if (delta
!= 0x20 || lo
& 0x20)
111 throw new Error("pairs of latin characters that don't mask with 0x20 not supported!");
112 latinTypeInfo
[lo
] = "CanonicalizeLatinMask0x20:0";
113 latinTypeInfo
[hi
] = "CanonicalizeLatinMask0x20:0";
118 // Pass 3: coalesce types into ranges.
119 for (var end
= 0; end
<= MAX_UCS2
; ++end
) {
121 var type
= typeInfo
[end
];
122 while (end
< MAX_UCS2
&& typeInfo
[end
+ 1] == type
)
124 rangeInfo
.push({begin:begin
, end:end
, type:type
});
127 var latinRangeInfo
= [];
128 // Pass 4: coalesce latin-1 types into ranges.
129 for (var end
= 0; end
<= MAX_UCS2
; ++end
) {
131 var type
= latinTypeInfo
[end
];
132 while (end
< MAX_UCS2
&& latinTypeInfo
[end
+ 1] == type
)
134 latinRangeInfo
.push({begin:begin
, end:end
, type:type
});
138 // Helper function to convert a number to a fixed width hex representation of a C uint16_t.
141 var s
= Number(x
).toString(16);
144 return "0x" + s
+ "u";
149 " * Copyright (C) 2012 Apple Inc. All rights reserved." + "\n" +
151 " * Redistribution and use in source and binary forms, with or without" + "\n" +
152 " * modification, are permitted provided that the following conditions" + "\n" +
153 " * are met:" + "\n" +
154 " * 1. Redistributions of source code must retain the above copyright" + "\n" +
155 " * notice, this list of conditions and the following disclaimer." + "\n" +
156 " * 2. Redistributions in binary form must reproduce the above copyright" + "\n" +
157 " * notice, this list of conditions and the following disclaimer in the" + "\n" +
158 " * documentation and/or other materials provided with the distribution." + "\n" +
160 " * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY" + "\n" +
161 " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE" + "\n" +
162 " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR" + "\n" +
163 " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR" + "\n" +
164 " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL," + "\n" +
165 " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO," + "\n" +
166 " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR" + "\n" +
167 " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY" + "\n" +
168 " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT" + "\n" +
169 " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE" + "\n" +
170 " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. " + "\n" +
175 print("// DO NOT EDIT! - this file autogenerated by YarrCanonicalizeUCS2.js");
177 print('#include "config.h"');
178 print('#include "YarrCanonicalizeUCS2.h"');
180 print("namespace JSC { namespace Yarr {");
182 print("#include <stdint.h>");
185 for (i
in characterSetInfo
) {
187 var set = characterSetInfo
[i
];
189 characters
+= hex(set[j
]) + ", ";
190 print("uint16_t ucs2CharacterSet" + i
+ "[] = { " + characters
+ "0 };");
193 print("static const size_t UCS2_CANONICALIZATION_SETS = " + characterSetInfo
.length
+ ";");
194 print("uint16_t* characterSetInfo[UCS2_CANONICALIZATION_SETS] = {");
195 for (i
in characterSetInfo
)
196 print(" ucs2CharacterSet" + i
+ ",");
199 print("const size_t UCS2_CANONICALIZATION_RANGES = " + rangeInfo
.length
+ ";");
200 print("UCS2CanonicalizationRange rangeInfo[UCS2_CANONICALIZATION_RANGES] = {");
201 for (i
in rangeInfo
) {
202 var info
= rangeInfo
[i
];
203 var typeAndValue
= info
.type
.split(':');
204 print(" { " + hex(info
.begin
) + ", " + hex(info
.end
) + ", " + hex(typeAndValue
[1]) + ", " + typeAndValue
[0] + " },");
208 print("const size_t LATIN_CANONICALIZATION_RANGES = " + latinRangeInfo
.length
+ ";");
209 print("LatinCanonicalizationRange latinRangeInfo[LATIN_CANONICALIZATION_RANGES] = {");
210 for (i
in latinRangeInfo
) {
211 var info
= latinRangeInfo
[i
];
212 var typeAndValue
= info
.type
.split(':');
213 print(" { " + hex(info
.begin
) + ", " + hex(info
.end
) + ", " + hex(typeAndValue
[1]) + ", " + typeAndValue
[0] + " },");
217 print("} } // JSC::Yarr");