2 * Copyright (C) 2012 Apple Inc. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 #ifndef YarrCanonicalizeUCS2_H
27 #define YarrCanonicalizeUCS2_H
30 #include <unicode/utypes.h>
32 namespace JSC
{ namespace Yarr
{
34 // This set of data (autogenerated using YarrCanonicalizeUCS2.js into YarrCanonicalizeUCS2.cpp)
35 // provides information for each UCS2 code point as to the set of code points that it should
36 // match under the ES5.1 case insensitive RegExp matching rules, specified in 15.10.2.8.
37 enum UCS2CanonicalizationType
{
38 CanonicalizeUnique
, // No canonically equal values, e.g. 0x0.
39 CanonicalizeSet
, // Value indicates a set in characterSetInfo.
40 CanonicalizeRangeLo
, // Value is positive delta to pair, E.g. 0x41 has value 0x20, -> 0x61.
41 CanonicalizeRangeHi
, // Value is positive delta to pair, E.g. 0x61 has value 0x20, -> 0x41.
42 CanonicalizeAlternatingAligned
, // Aligned consequtive pair, e.g. 0x1f4,0x1f5.
43 CanonicalizeAlternatingUnaligned
, // Unaligned consequtive pair, e.g. 0x241,0x242.
45 struct UCS2CanonicalizationRange
{ uint16_t begin
, end
, value
, type
; };
46 extern const size_t UCS2_CANONICALIZATION_RANGES
;
47 extern const uint16_t* characterSetInfo
[];
48 extern const UCS2CanonicalizationRange rangeInfo
[];
50 // This table is similar to the full rangeInfo table, however this maps from UCS2 codepoints to
51 // the set of Latin1 codepoints that could match.
52 enum LatinCanonicalizationType
{
53 CanonicalizeLatinSelf
, // This character is in the Latin1 range, but has no canonical equivalent in the range.
54 CanonicalizeLatinMask0x20
, // One of a pair of characters, under the mask 0x20.
55 CanonicalizeLatinOther
, // This character is not in the Latin1 range, but canonicalizes to another that is.
56 CanonicalizeLatinInvalid
, // Cannot match against Latin1 input.
58 struct LatinCanonicalizationRange
{ uint16_t begin
, end
, value
, type
; };
59 extern const size_t LATIN_CANONICALIZATION_RANGES
;
60 extern LatinCanonicalizationRange latinRangeInfo
[];
62 // This searches in log2 time over ~364 entries, so should typically result in 8 compares.
63 inline const UCS2CanonicalizationRange
* rangeInfoFor(UChar ch
)
65 const UCS2CanonicalizationRange
* info
= rangeInfo
;
66 size_t entries
= UCS2_CANONICALIZATION_RANGES
;
69 size_t candidate
= entries
>> 1;
70 const UCS2CanonicalizationRange
* candidateInfo
= info
+ candidate
;
71 if (ch
< candidateInfo
->begin
)
73 else if (ch
<= candidateInfo
->end
)
76 info
= candidateInfo
+ 1;
77 entries
-= (candidate
+ 1);
82 // Should only be called for characters that have one canonically matching value.
83 inline UChar
getCanonicalPair(const UCS2CanonicalizationRange
* info
, UChar ch
)
85 ASSERT(ch
>= info
->begin
&& ch
<= info
->end
);
87 case CanonicalizeRangeLo
:
88 return ch
+ info
->value
;
89 case CanonicalizeRangeHi
:
90 return ch
- info
->value
;
91 case CanonicalizeAlternatingAligned
:
93 case CanonicalizeAlternatingUnaligned
:
94 return ((ch
- 1) ^ 1) + 1;
96 RELEASE_ASSERT_NOT_REACHED();
98 RELEASE_ASSERT_NOT_REACHED();
102 // Returns true if no other UCS2 codepoint can match this value.
103 inline bool isCanonicallyUnique(UChar ch
)
105 return rangeInfoFor(ch
)->type
== CanonicalizeUnique
;
108 // Returns true if values are equal, under the canonicalization rules.
109 inline bool areCanonicallyEquivalent(UChar a
, UChar b
)
111 const UCS2CanonicalizationRange
* info
= rangeInfoFor(a
);
112 switch (info
->type
) {
113 case CanonicalizeUnique
:
115 case CanonicalizeSet
: {
116 for (const uint16_t* set
= characterSetInfo
[info
->value
]; (a
= *set
); ++set
) {
122 case CanonicalizeRangeLo
:
123 return (a
== b
) || (a
+ info
->value
== b
);
124 case CanonicalizeRangeHi
:
125 return (a
== b
) || (a
- info
->value
== b
);
126 case CanonicalizeAlternatingAligned
:
127 return (a
| 1) == (b
| 1);
128 case CanonicalizeAlternatingUnaligned
:
129 return ((a
- 1) | 1) == ((b
- 1) | 1);
132 RELEASE_ASSERT_NOT_REACHED();