]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/bmpset.cpp
ICU-400.37.tar.gz
[apple/icu.git] / icuSources / common / bmpset.cpp
CommitLineData
46f4442e
A
1/*
2******************************************************************************
3*
4* Copyright (C) 2007-2008, International Business Machines
5* Corporation and others. All Rights Reserved.
6*
7******************************************************************************
8* file name: bmpset.cpp
9* encoding: US-ASCII
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2007jan29
14* created by: Markus W. Scherer
15*/
16
17#include "unicode/utypes.h"
18#include "unicode/uniset.h"
19#include "cmemory.h"
20#include "bmpset.h"
21
22U_NAMESPACE_BEGIN
23
24BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
25 list(parentList), listLength(parentListLength) {
26 uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
27 uprv_memset(table7FF, 0, sizeof(table7FF));
28 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
29
30 /*
31 * Set the list indexes for binary searches for
32 * U+0800, U+1000, U+2000, .., U+F000, U+10000.
33 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
34 * looked up in the bit tables.
35 * The last pair of indexes is for finding supplementary code points.
36 */
37 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
38 int32_t i;
39 for(i=1; i<=0x10; ++i) {
40 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
41 }
42 list4kStarts[0x11]=listLength-1;
43
44 initBits();
45 overrideIllegal();
46}
47
48BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
49 list(newParentList), listLength(newParentListLength) {
50 uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
51 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
52 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
53 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
54}
55
56BMPSet::~BMPSet() {
57}
58
/*
 * Set bits in a bit rectangle in "vertical" bit organization:
 * for a value v in [start, limit), bit (v>>6) of table[v&0x3f] is set.
 *
 * Preconditions: start<limit<=0x800.
 *
 * All shifts are done on uint32_t so that setting bit 31 never overflows a
 * signed int, and no shift by 32 is evaluated when limit==0x800
 * (limitLead==32, limitTrail==0).
 */
static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    int32_t lead=start>>6;      // Bit number (column) of the first value.
    int32_t trail=start&0x3f;   // Table index (row) of the first value.

    // One bit marking the column of the first value.
    uint32_t bits=(uint32_t)1<<lead;
    if((start+1)==limit) {  // Single-value shortcut.
        table[trail]|=bits;
        return;
    }

    int32_t limitLead=limit>>6;
    int32_t limitTrail=limit&0x3f;

    if(lead==limitLead) {
        // Partial vertical bit column.
        while(trail<limitTrail) {
            table[trail++]|=bits;
        }
    } else {
        // Partial vertical bit column,
        // followed by a bit rectangle,
        // followed by another partial vertical bit column.
        if(trail>0) {
            do {
                table[trail++]|=bits;
            } while(trail<64);
            ++lead;
        }
        if(lead<limitLead) {
            // Here lead<=31, so the shift is well-defined.
            bits=~(((uint32_t)1<<lead)-1);
            if(limitLead<0x20) {
                bits&=((uint32_t)1<<limitLead)-1;
            }
            for(trail=0; trail<64; ++trail) {
                table[trail]|=bits;
            }
        }
        // limit<=0x800: limitTrail>0 implies limitLead<=31.
        // When limit==0x800 there is no final partial column, and
        // 1<<32 must not be evaluated.
        if(limitTrail>0) {
            bits=(uint32_t)1<<limitLead;
            for(trail=0; trail<limitTrail; ++trail) {
                table[trail]|=bits;
            }
        }
    }
}
107
108void BMPSet::initBits() {
109 UChar32 start, limit;
110 int32_t listIndex=0;
111
112 // Set asciiBytes[].
113 do {
114 start=list[listIndex++];
115 if(listIndex<listLength) {
116 limit=list[listIndex++];
117 } else {
118 limit=0x110000;
119 }
120 if(start>=0x80) {
121 break;
122 }
123 do {
124 asciiBytes[start++]=1;
125 } while(start<limit && start<0x80);
126 } while(limit<=0x80);
127
128 // Set table7FF[].
129 while(start<0x800) {
130 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
131 if(limit>0x800) {
132 start=0x800;
133 break;
134 }
135
136 start=list[listIndex++];
137 if(listIndex<listLength) {
138 limit=list[listIndex++];
139 } else {
140 limit=0x110000;
141 }
142 }
143
144 // Set bmpBlockBits[].
145 int32_t minStart=0x800;
146 while(start<0x10000) {
147 if(limit>0x10000) {
148 limit=0x10000;
149 }
150
151 if(start<minStart) {
152 start=minStart;
153 }
154 if(start<limit) { // Else: Another range entirely in a known mixed-value block.
155 if(start&0x3f) {
156 // Mixed-value block of 64 code points.
157 start>>=6;
158 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
159 start=(start+1)<<6; // Round up to the next block boundary.
160 minStart=start; // Ignore further ranges in this block.
161 }
162 if(start<limit) {
163 if(start<(limit&~0x3f)) {
164 // Multiple all-ones blocks of 64 code points each.
165 set32x64Bits(bmpBlockBits, start>>6, limit>>6);
166 }
167
168 if(limit&0x3f) {
169 // Mixed-value block of 64 code points.
170 limit>>=6;
171 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
172 limit=(limit+1)<<6; // Round up to the next block boundary.
173 minStart=limit; // Ignore further ranges in this block.
174 }
175 }
176 }
177
178 if(limit==0x10000) {
179 break;
180 }
181
182 start=list[listIndex++];
183 if(listIndex<listLength) {
184 limit=list[listIndex++];
185 } else {
186 limit=0x110000;
187 }
188 }
189}
190
191/*
192 * Override some bits and bytes to the result of contains(FFFD)
193 * for faster validity checking at runtime.
194 * No need to set 0 values where they were reset to 0 in the constructor
195 * and not modified by initBits().
196 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
197 * Need to set 0 values for surrogates D800..DFFF.
198 */
199void BMPSet::overrideIllegal() {
200 uint32_t bits, mask;
201 int32_t i;
202
203 if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
204 // contains(FFFD)==TRUE
205 for(i=0x80; i<0xc0; ++i) {
206 asciiBytes[i]=1;
207 }
208
209 bits=3; // Lead bytes 0xC0 and 0xC1.
210 for(i=0; i<64; ++i) {
211 table7FF[i]|=bits;
212 }
213
214 bits=1; // Lead byte 0xE0.
215 for(i=0; i<32; ++i) { // First half of 4k block.
216 bmpBlockBits[i]|=bits;
217 }
218
219 mask=~(0x10001<<0xd); // Lead byte 0xED.
220 bits=1<<0xd;
221 for(i=32; i<64; ++i) { // Second half of 4k block.
222 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
223 }
224 } else {
225 // contains(FFFD)==FALSE
226 mask=~(0x10001<<0xd); // Lead byte 0xED.
227 for(i=32; i<64; ++i) { // Second half of 4k block.
228 bmpBlockBits[i]&=mask;
229 }
230 }
231}
232
233int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
234 /* Examples:
235 findCodePoint(c)
236 set list[] c=0 1 3 4 7 8
237 === ============== ===========
238 [] [110000] 0 0 0 0 0 0
239 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
240 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
241 [:Any:] [0, 110000] 1 1 1 1 1 1
242 */
243
244 // Return the smallest i such that c < list[i]. Assume
245 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
246 if (c < list[lo])
247 return lo;
248 // High runner test. c is often after the last range, so an
249 // initial check for this condition pays off.
250 if (lo >= hi || c >= list[hi-1])
251 return hi;
252 // invariant: c >= list[lo]
253 // invariant: c < list[hi]
254 for (;;) {
255 int32_t i = (lo + hi) >> 1;
256 if (i == lo) {
257 break; // Found!
258 } else if (c < list[i]) {
259 hi = i;
260 } else {
261 lo = i;
262 }
263 }
264 return hi;
265}
266
267UBool
268BMPSet::contains(UChar32 c) const {
269 if((uint32_t)c<=0x7f) {
270 return (UBool)asciiBytes[c];
271 } else if((uint32_t)c<=0x7ff) {
272 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
273 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
274 int lead=c>>12;
275 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
276 if(twoBits<=1) {
277 // All 64 code points with the same bits 15..6
278 // are either in the set or not.
279 return (UBool)twoBits;
280 } else {
281 // Look up the code point in its 4k block of code points.
282 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
283 }
284 } else if((uint32_t)c<=0x10ffff) {
285 // surrogate or supplementary code point
286 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
287 } else {
288 // Out-of-range code points get FALSE, consistent with long-standing
289 // behavior of UnicodeSet::contains(c).
290 return FALSE;
291 }
292}
293
294/*
295 * Check for sufficient length for trail unit for each surrogate pair.
296 * Handle single surrogates as surrogate code points as usual in ICU.
297 */
298const UChar *
299BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
300 UChar c, c2;
301
302 if(spanCondition) {
303 // span
304 do {
305 c=*s;
306 if(c<=0x7f) {
307 if(!asciiBytes[c]) {
308 break;
309 }
310 } else if(c<=0x7ff) {
311 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
312 break;
313 }
314 } else if(c<0xd800 || c>=0xe000) {
315 int lead=c>>12;
316 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
317 if(twoBits<=1) {
318 // All 64 code points with the same bits 15..6
319 // are either in the set or not.
320 if(twoBits==0) {
321 break;
322 }
323 } else {
324 // Look up the code point in its 4k block of code points.
325 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
326 break;
327 }
328 }
329 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
330 // surrogate code point
331 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
332 break;
333 }
334 } else {
335 // surrogate pair
336 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
337 break;
338 }
339 ++s;
340 }
341 } while(++s<limit);
342 } else {
343 // span not
344 do {
345 c=*s;
346 if(c<=0x7f) {
347 if(asciiBytes[c]) {
348 break;
349 }
350 } else if(c<=0x7ff) {
351 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
352 break;
353 }
354 } else if(c<0xd800 || c>=0xe000) {
355 int lead=c>>12;
356 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
357 if(twoBits<=1) {
358 // All 64 code points with the same bits 15..6
359 // are either in the set or not.
360 if(twoBits!=0) {
361 break;
362 }
363 } else {
364 // Look up the code point in its 4k block of code points.
365 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
366 break;
367 }
368 }
369 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
370 // surrogate code point
371 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
372 break;
373 }
374 } else {
375 // surrogate pair
376 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
377 break;
378 }
379 ++s;
380 }
381 } while(++s<limit);
382 }
383 return s;
384}
385
386/* Symmetrical with span(). */
387const UChar *
388BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
389 UChar c, c2;
390
391 if(spanCondition) {
392 // span
393 for(;;) {
394 c=*(--limit);
395 if(c<=0x7f) {
396 if(!asciiBytes[c]) {
397 break;
398 }
399 } else if(c<=0x7ff) {
400 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
401 break;
402 }
403 } else if(c<0xd800 || c>=0xe000) {
404 int lead=c>>12;
405 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
406 if(twoBits<=1) {
407 // All 64 code points with the same bits 15..6
408 // are either in the set or not.
409 if(twoBits==0) {
410 break;
411 }
412 } else {
413 // Look up the code point in its 4k block of code points.
414 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
415 break;
416 }
417 }
418 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
419 // surrogate code point
420 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
421 break;
422 }
423 } else {
424 // surrogate pair
425 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
426 break;
427 }
428 --limit;
429 }
430 if(s==limit) {
431 return s;
432 }
433 }
434 } else {
435 // span not
436 for(;;) {
437 c=*(--limit);
438 if(c<=0x7f) {
439 if(asciiBytes[c]) {
440 break;
441 }
442 } else if(c<=0x7ff) {
443 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
444 break;
445 }
446 } else if(c<0xd800 || c>=0xe000) {
447 int lead=c>>12;
448 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
449 if(twoBits<=1) {
450 // All 64 code points with the same bits 15..6
451 // are either in the set or not.
452 if(twoBits!=0) {
453 break;
454 }
455 } else {
456 // Look up the code point in its 4k block of code points.
457 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
458 break;
459 }
460 }
461 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
462 // surrogate code point
463 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
464 break;
465 }
466 } else {
467 // surrogate pair
468 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
469 break;
470 }
471 --limit;
472 }
473 if(s==limit) {
474 return s;
475 }
476 }
477 }
478 return limit+1;
479}
480
481/*
482 * Precheck for sufficient trail bytes at end of string only once per span.
483 * Check validity.
484 */
485const uint8_t *
486BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
487 const uint8_t *limit=s+length;
488 uint8_t b=*s;
489 if((int8_t)b>=0) {
490 // Initial all-ASCII span.
491 if(spanCondition) {
492 do {
493 if(!asciiBytes[b] || ++s==limit) {
494 return s;
495 }
496 b=*s;
497 } while((int8_t)b>=0);
498 } else {
499 do {
500 if(asciiBytes[b] || ++s==limit) {
501 return s;
502 }
503 b=*s;
504 } while((int8_t)b>=0);
505 }
506 length=(int32_t)(limit-s);
507 }
508
509 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
510 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
511 }
512
513 const uint8_t *limit0=limit;
514
515 /*
516 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
517 * or runs into a lead byte.
518 * In the span loop compare s with limit only once
519 * per multi-byte character.
520 *
521 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
522 * including it if that is part of the span, otherwise set limit0 to before
523 * the truncated sequence.
524 */
525 b=*(limit-1);
526 if((int8_t)b<0) {
527 // b>=0x80: lead or trail byte
528 if(b<0xc0) {
529 // single trail byte, check for preceding 3- or 4-byte lead byte
530 if(length>=2 && (b=*(limit-2))>=0xe0) {
531 limit-=2;
532 if(asciiBytes[0x80]!=spanCondition) {
533 limit0=limit;
534 }
535 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
536 // 4-byte lead byte with only two trail bytes
537 limit-=3;
538 if(asciiBytes[0x80]!=spanCondition) {
539 limit0=limit;
540 }
541 }
542 } else {
543 // lead byte with no trail bytes
544 --limit;
545 if(asciiBytes[0x80]!=spanCondition) {
546 limit0=limit;
547 }
548 }
549 }
550
551 uint8_t t1, t2, t3;
552
553 while(s<limit) {
554 b=*s;
555 if(b<0xc0) {
556 // ASCII; or trail bytes with the result of contains(FFFD).
557 if(spanCondition) {
558 do {
559 if(!asciiBytes[b]) {
560 return s;
561 } else if(++s==limit) {
562 return limit0;
563 }
564 b=*s;
565 } while(b<0xc0);
566 } else {
567 do {
568 if(asciiBytes[b]) {
569 return s;
570 } else if(++s==limit) {
571 return limit0;
572 }
573 b=*s;
574 } while(b<0xc0);
575 }
576 }
577 ++s; // Advance past the lead byte.
578 if(b>=0xe0) {
579 if(b<0xf0) {
580 if( /* handle U+0000..U+FFFF inline */
581 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
582 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
583 ) {
584 b&=0xf;
585 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
586 if(twoBits<=1) {
587 // All 64 code points with this lead byte and middle trail byte
588 // are either in the set or not.
589 if(twoBits!=(uint32_t)spanCondition) {
590 return s-1;
591 }
592 } else {
593 // Look up the code point in its 4k block of code points.
594 UChar32 c=(b<<12)|(t1<<6)|t2;
595 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
596 return s-1;
597 }
598 }
599 s+=2;
600 continue;
601 }
602 } else if( /* handle U+10000..U+10FFFF inline */
603 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
604 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
605 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
606 ) {
607 // Give an illegal sequence the same value as the result of contains(FFFD).
608 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
609 if( ( (0x10000<=c && c<=0x10ffff) ?
610 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
611 asciiBytes[0x80]
612 ) != spanCondition
613 ) {
614 return s-1;
615 }
616 s+=3;
617 continue;
618 }
619 } else /* 0xc0<=b<0xe0 */ {
620 if( /* handle U+0000..U+07FF inline */
621 (t1=(uint8_t)(*s-0x80)) <= 0x3f
622 ) {
623 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
624 return s-1;
625 }
626 ++s;
627 continue;
628 }
629 }
630
631 // Give an illegal sequence the same value as the result of contains(FFFD).
632 // Handle each byte of an illegal sequence separately to simplify the code;
633 // no need to optimize error handling.
634 if(asciiBytes[0x80]!=spanCondition) {
635 return s-1;
636 }
637 }
638
639 return limit0;
640}
641
642/*
643 * While going backwards through UTF-8 optimize only for ASCII.
644 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
645 * possible to tell from the last byte in a multi-byte sequence how many
646 * preceding bytes there should be. Therefore, going backwards through UTF-8
647 * is much harder than going forward.
648 */
649int32_t
650BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
651 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
652 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
653 }
654
655 uint8_t b;
656
657 do {
658 b=s[--length];
659 if((int8_t)b>=0) {
660 // ASCII sub-span
661 if(spanCondition) {
662 do {
663 if(!asciiBytes[b]) {
664 return length+1;
665 } else if(length==0) {
666 return 0;
667 }
668 b=s[--length];
669 } while((int8_t)b>=0);
670 } else {
671 do {
672 if(asciiBytes[b]) {
673 return length+1;
674 } else if(length==0) {
675 return 0;
676 }
677 b=s[--length];
678 } while((int8_t)b>=0);
679 }
680 }
681
682 int32_t prev=length;
683 UChar32 c;
684 if(b<0xc0) {
685 // trail byte: collect a multi-byte character
686 c=utf8_prevCharSafeBody(s, 0, &length, b, -1);
687 if(c<0) {
688 c=0xfffd;
689 }
690 } else {
691 // lead byte in last-trail position
692 c=0xfffd;
693 }
694 // c is a valid code point, not ASCII, not a surrogate
695 if(c<=0x7ff) {
696 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
697 return prev+1;
698 }
699 } else if(c<=0xffff) {
700 int lead=c>>12;
701 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
702 if(twoBits<=1) {
703 // All 64 code points with the same bits 15..6
704 // are either in the set or not.
705 if(twoBits!=(uint32_t)spanCondition) {
706 return prev+1;
707 }
708 } else {
709 // Look up the code point in its 4k block of code points.
710 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
711 return prev+1;
712 }
713 }
714 } else {
715 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
716 return prev+1;
717 }
718 }
719 } while(length>0);
720 return 0;
721}
722
723U_NAMESPACE_END