]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/bmpset.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / common / bmpset.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
46f4442e
A
3/*
4******************************************************************************
5*
51004dcb 6* Copyright (C) 2007-2012, International Business Machines
46f4442e
A
7* Corporation and others. All Rights Reserved.
8*
9******************************************************************************
10* file name: bmpset.cpp
f3c0d7a5 11* encoding: UTF-8
46f4442e
A
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2007jan29
16* created by: Markus W. Scherer
17*/
18
19#include "unicode/utypes.h"
20#include "unicode/uniset.h"
4388f060
A
21#include "unicode/utf8.h"
22#include "unicode/utf16.h"
46f4442e
A
23#include "cmemory.h"
24#include "bmpset.h"
4388f060 25#include "uassert.h"
46f4442e
A
26
27U_NAMESPACE_BEGIN
28
29BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
30 list(parentList), listLength(parentListLength) {
0f5d89e8 31 uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
46f4442e
A
32 uprv_memset(table7FF, 0, sizeof(table7FF));
33 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
34
35 /*
36 * Set the list indexes for binary searches for
37 * U+0800, U+1000, U+2000, .., U+F000, U+10000.
38 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
39 * looked up in the bit tables.
40 * The last pair of indexes is for finding supplementary code points.
41 */
42 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
43 int32_t i;
44 for(i=1; i<=0x10; ++i) {
45 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
46 }
47 list4kStarts[0x11]=listLength-1;
0f5d89e8 48 containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
46f4442e
A
49
50 initBits();
51 overrideIllegal();
52}
53
54BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
0f5d89e8 55 containsFFFD(otherBMPSet.containsFFFD),
46f4442e 56 list(newParentList), listLength(newParentListLength) {
0f5d89e8 57 uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
46f4442e
A
58 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
59 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
60 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
61}
62
63BMPSet::~BMPSet() {
64}
65
66/*
67 * Set bits in a bit rectangle in "vertical" bit organization.
68 * start<limit<=0x800
69 */
70static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
4388f060
A
71 U_ASSERT(start<limit);
72 U_ASSERT(limit<=0x800);
73
74 int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
75 int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
46f4442e
A
76
77 // Set one bit indicating an all-one block.
78 uint32_t bits=(uint32_t)1<<lead;
79 if((start+1)==limit) { // Single-character shortcut.
80 table[trail]|=bits;
81 return;
82 }
83
84 int32_t limitLead=limit>>6;
85 int32_t limitTrail=limit&0x3f;
86
87 if(lead==limitLead) {
88 // Partial vertical bit column.
89 while(trail<limitTrail) {
90 table[trail++]|=bits;
91 }
92 } else {
93 // Partial vertical bit column,
94 // followed by a bit rectangle,
95 // followed by another partial vertical bit column.
96 if(trail>0) {
97 do {
98 table[trail++]|=bits;
99 } while(trail<64);
100 ++lead;
101 }
102 if(lead<limitLead) {
0f5d89e8 103 bits=~(((unsigned)1<<lead)-1);
46f4442e 104 if(limitLead<0x20) {
0f5d89e8 105 bits&=((unsigned)1<<limitLead)-1;
46f4442e
A
106 }
107 for(trail=0; trail<64; ++trail) {
108 table[trail]|=bits;
109 }
110 }
4388f060
A
111 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
112 // In that case, bits=1<<limitLead is undefined but the bits value
113 // is not used because trail<limitTrail is already false.
51004dcb 114 bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
46f4442e
A
115 for(trail=0; trail<limitTrail; ++trail) {
116 table[trail]|=bits;
117 }
118 }
119}
120
121void BMPSet::initBits() {
122 UChar32 start, limit;
123 int32_t listIndex=0;
124
0f5d89e8 125 // Set latin1Contains[].
46f4442e
A
126 do {
127 start=list[listIndex++];
128 if(listIndex<listLength) {
129 limit=list[listIndex++];
130 } else {
131 limit=0x110000;
132 }
0f5d89e8 133 if(start>=0x100) {
46f4442e
A
134 break;
135 }
136 do {
0f5d89e8
A
137 latin1Contains[start++]=1;
138 } while(start<limit && start<0x100);
139 } while(limit<=0x100);
140
141 // Find the first range overlapping with (or after) 80..FF again,
142 // to include them in table7FF as well.
143 for(listIndex=0;;) {
144 start=list[listIndex++];
145 if(listIndex<listLength) {
146 limit=list[listIndex++];
147 } else {
148 limit=0x110000;
149 }
150 if(limit>0x80) {
151 if(start<0x80) {
152 start=0x80;
153 }
154 break;
155 }
156 }
46f4442e
A
157
158 // Set table7FF[].
159 while(start<0x800) {
160 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
161 if(limit>0x800) {
162 start=0x800;
163 break;
164 }
165
166 start=list[listIndex++];
167 if(listIndex<listLength) {
168 limit=list[listIndex++];
169 } else {
170 limit=0x110000;
171 }
172 }
173
174 // Set bmpBlockBits[].
175 int32_t minStart=0x800;
176 while(start<0x10000) {
177 if(limit>0x10000) {
178 limit=0x10000;
179 }
180
181 if(start<minStart) {
182 start=minStart;
183 }
184 if(start<limit) { // Else: Another range entirely in a known mixed-value block.
185 if(start&0x3f) {
186 // Mixed-value block of 64 code points.
187 start>>=6;
188 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
189 start=(start+1)<<6; // Round up to the next block boundary.
190 minStart=start; // Ignore further ranges in this block.
191 }
192 if(start<limit) {
193 if(start<(limit&~0x3f)) {
194 // Multiple all-ones blocks of 64 code points each.
195 set32x64Bits(bmpBlockBits, start>>6, limit>>6);
196 }
197
198 if(limit&0x3f) {
199 // Mixed-value block of 64 code points.
200 limit>>=6;
201 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
202 limit=(limit+1)<<6; // Round up to the next block boundary.
203 minStart=limit; // Ignore further ranges in this block.
204 }
205 }
206 }
207
208 if(limit==0x10000) {
209 break;
210 }
211
212 start=list[listIndex++];
213 if(listIndex<listLength) {
214 limit=list[listIndex++];
215 } else {
216 limit=0x110000;
217 }
218 }
219}
220
221/*
222 * Override some bits and bytes to the result of contains(FFFD)
223 * for faster validity checking at runtime.
224 * No need to set 0 values where they were reset to 0 in the constructor
225 * and not modified by initBits().
0f5d89e8 226 * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
46f4442e
A
227 * Need to set 0 values for surrogates D800..DFFF.
228 */
229void BMPSet::overrideIllegal() {
230 uint32_t bits, mask;
231 int32_t i;
232
0f5d89e8 233 if(containsFFFD) {
46f4442e
A
234 bits=3; // Lead bytes 0xC0 and 0xC1.
235 for(i=0; i<64; ++i) {
236 table7FF[i]|=bits;
237 }
238
239 bits=1; // Lead byte 0xE0.
240 for(i=0; i<32; ++i) { // First half of 4k block.
241 bmpBlockBits[i]|=bits;
242 }
243
3d1f044b 244 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
46f4442e
A
245 bits=1<<0xd;
246 for(i=32; i<64; ++i) { // Second half of 4k block.
247 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
248 }
249 } else {
3d1f044b 250 mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED.
46f4442e
A
251 for(i=32; i<64; ++i) { // Second half of 4k block.
252 bmpBlockBits[i]&=mask;
253 }
254 }
255}
256
257int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
258 /* Examples:
259 findCodePoint(c)
260 set list[] c=0 1 3 4 7 8
261 === ============== ===========
262 [] [110000] 0 0 0 0 0 0
263 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
264 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
265 [:Any:] [0, 110000] 1 1 1 1 1 1
266 */
267
268 // Return the smallest i such that c < list[i]. Assume
269 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
270 if (c < list[lo])
271 return lo;
272 // High runner test. c is often after the last range, so an
273 // initial check for this condition pays off.
274 if (lo >= hi || c >= list[hi-1])
275 return hi;
276 // invariant: c >= list[lo]
277 // invariant: c < list[hi]
278 for (;;) {
279 int32_t i = (lo + hi) >> 1;
280 if (i == lo) {
281 break; // Found!
282 } else if (c < list[i]) {
283 hi = i;
284 } else {
285 lo = i;
286 }
287 }
288 return hi;
289}
290
291UBool
292BMPSet::contains(UChar32 c) const {
0f5d89e8
A
293 if((uint32_t)c<=0xff) {
294 return (UBool)latin1Contains[c];
46f4442e
A
295 } else if((uint32_t)c<=0x7ff) {
296 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
297 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
298 int lead=c>>12;
299 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
300 if(twoBits<=1) {
301 // All 64 code points with the same bits 15..6
302 // are either in the set or not.
303 return (UBool)twoBits;
304 } else {
305 // Look up the code point in its 4k block of code points.
306 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
307 }
308 } else if((uint32_t)c<=0x10ffff) {
309 // surrogate or supplementary code point
310 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
311 } else {
312 // Out-of-range code points get FALSE, consistent with long-standing
313 // behavior of UnicodeSet::contains(c).
314 return FALSE;
315 }
316}
317
318/*
319 * Check for sufficient length for trail unit for each surrogate pair.
320 * Handle single surrogates as surrogate code points as usual in ICU.
321 */
322const UChar *
323BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
324 UChar c, c2;
325
326 if(spanCondition) {
327 // span
328 do {
329 c=*s;
0f5d89e8
A
330 if(c<=0xff) {
331 if(!latin1Contains[c]) {
46f4442e
A
332 break;
333 }
334 } else if(c<=0x7ff) {
335 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
336 break;
337 }
338 } else if(c<0xd800 || c>=0xe000) {
339 int lead=c>>12;
340 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
341 if(twoBits<=1) {
342 // All 64 code points with the same bits 15..6
343 // are either in the set or not.
344 if(twoBits==0) {
345 break;
346 }
347 } else {
348 // Look up the code point in its 4k block of code points.
349 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
350 break;
351 }
352 }
353 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
354 // surrogate code point
355 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
356 break;
357 }
358 } else {
359 // surrogate pair
360 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
361 break;
362 }
363 ++s;
364 }
365 } while(++s<limit);
366 } else {
367 // span not
368 do {
369 c=*s;
0f5d89e8
A
370 if(c<=0xff) {
371 if(latin1Contains[c]) {
46f4442e
A
372 break;
373 }
374 } else if(c<=0x7ff) {
375 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
376 break;
377 }
378 } else if(c<0xd800 || c>=0xe000) {
379 int lead=c>>12;
380 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
381 if(twoBits<=1) {
382 // All 64 code points with the same bits 15..6
383 // are either in the set or not.
384 if(twoBits!=0) {
385 break;
386 }
387 } else {
388 // Look up the code point in its 4k block of code points.
389 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
390 break;
391 }
392 }
393 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
394 // surrogate code point
395 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
396 break;
397 }
398 } else {
399 // surrogate pair
400 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
401 break;
402 }
403 ++s;
404 }
405 } while(++s<limit);
406 }
407 return s;
408}
409
410/* Symmetrical with span(). */
411const UChar *
412BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
413 UChar c, c2;
414
415 if(spanCondition) {
416 // span
417 for(;;) {
418 c=*(--limit);
0f5d89e8
A
419 if(c<=0xff) {
420 if(!latin1Contains[c]) {
46f4442e
A
421 break;
422 }
423 } else if(c<=0x7ff) {
424 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
425 break;
426 }
427 } else if(c<0xd800 || c>=0xe000) {
428 int lead=c>>12;
429 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
430 if(twoBits<=1) {
431 // All 64 code points with the same bits 15..6
432 // are either in the set or not.
433 if(twoBits==0) {
434 break;
435 }
436 } else {
437 // Look up the code point in its 4k block of code points.
438 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
439 break;
440 }
441 }
442 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
443 // surrogate code point
444 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
445 break;
446 }
447 } else {
448 // surrogate pair
449 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
450 break;
451 }
452 --limit;
453 }
454 if(s==limit) {
455 return s;
456 }
457 }
458 } else {
459 // span not
460 for(;;) {
461 c=*(--limit);
0f5d89e8
A
462 if(c<=0xff) {
463 if(latin1Contains[c]) {
46f4442e
A
464 break;
465 }
466 } else if(c<=0x7ff) {
467 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
468 break;
469 }
470 } else if(c<0xd800 || c>=0xe000) {
471 int lead=c>>12;
472 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
473 if(twoBits<=1) {
474 // All 64 code points with the same bits 15..6
475 // are either in the set or not.
476 if(twoBits!=0) {
477 break;
478 }
479 } else {
480 // Look up the code point in its 4k block of code points.
481 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
482 break;
483 }
484 }
485 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
486 // surrogate code point
487 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
488 break;
489 }
490 } else {
491 // surrogate pair
492 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
493 break;
494 }
495 --limit;
496 }
497 if(s==limit) {
498 return s;
499 }
500 }
501 }
502 return limit+1;
503}
504
505/*
506 * Precheck for sufficient trail bytes at end of string only once per span.
507 * Check validity.
508 */
509const uint8_t *
510BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
511 const uint8_t *limit=s+length;
512 uint8_t b=*s;
0f5d89e8 513 if(U8_IS_SINGLE(b)) {
46f4442e
A
514 // Initial all-ASCII span.
515 if(spanCondition) {
516 do {
0f5d89e8 517 if(!latin1Contains[b] || ++s==limit) {
46f4442e
A
518 return s;
519 }
520 b=*s;
0f5d89e8 521 } while(U8_IS_SINGLE(b));
46f4442e
A
522 } else {
523 do {
0f5d89e8 524 if(latin1Contains[b] || ++s==limit) {
46f4442e
A
525 return s;
526 }
527 b=*s;
0f5d89e8 528 } while(U8_IS_SINGLE(b));
46f4442e
A
529 }
530 length=(int32_t)(limit-s);
531 }
532
533 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
534 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
535 }
536
537 const uint8_t *limit0=limit;
538
539 /*
540 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
541 * or runs into a lead byte.
542 * In the span loop compare s with limit only once
543 * per multi-byte character.
544 *
545 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
546 * including it if that is part of the span, otherwise set limit0 to before
547 * the truncated sequence.
548 */
549 b=*(limit-1);
550 if((int8_t)b<0) {
551 // b>=0x80: lead or trail byte
552 if(b<0xc0) {
553 // single trail byte, check for preceding 3- or 4-byte lead byte
554 if(length>=2 && (b=*(limit-2))>=0xe0) {
555 limit-=2;
0f5d89e8 556 if(containsFFFD!=spanCondition) {
46f4442e
A
557 limit0=limit;
558 }
559 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
560 // 4-byte lead byte with only two trail bytes
561 limit-=3;
0f5d89e8 562 if(containsFFFD!=spanCondition) {
46f4442e
A
563 limit0=limit;
564 }
565 }
566 } else {
567 // lead byte with no trail bytes
568 --limit;
0f5d89e8 569 if(containsFFFD!=spanCondition) {
46f4442e
A
570 limit0=limit;
571 }
572 }
573 }
574
575 uint8_t t1, t2, t3;
576
577 while(s<limit) {
578 b=*s;
0f5d89e8
A
579 if(U8_IS_SINGLE(b)) {
580 // ASCII
46f4442e
A
581 if(spanCondition) {
582 do {
0f5d89e8 583 if(!latin1Contains[b]) {
46f4442e
A
584 return s;
585 } else if(++s==limit) {
586 return limit0;
587 }
588 b=*s;
0f5d89e8 589 } while(U8_IS_SINGLE(b));
46f4442e
A
590 } else {
591 do {
0f5d89e8 592 if(latin1Contains[b]) {
46f4442e
A
593 return s;
594 } else if(++s==limit) {
595 return limit0;
596 }
597 b=*s;
0f5d89e8 598 } while(U8_IS_SINGLE(b));
46f4442e
A
599 }
600 }
601 ++s; // Advance past the lead byte.
602 if(b>=0xe0) {
603 if(b<0xf0) {
604 if( /* handle U+0000..U+FFFF inline */
605 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
606 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
607 ) {
608 b&=0xf;
609 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
610 if(twoBits<=1) {
611 // All 64 code points with this lead byte and middle trail byte
612 // are either in the set or not.
613 if(twoBits!=(uint32_t)spanCondition) {
614 return s-1;
615 }
616 } else {
617 // Look up the code point in its 4k block of code points.
618 UChar32 c=(b<<12)|(t1<<6)|t2;
619 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
620 return s-1;
621 }
622 }
623 s+=2;
624 continue;
625 }
626 } else if( /* handle U+10000..U+10FFFF inline */
627 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
628 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
629 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
630 ) {
631 // Give an illegal sequence the same value as the result of contains(FFFD).
632 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
633 if( ( (0x10000<=c && c<=0x10ffff) ?
634 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
0f5d89e8 635 containsFFFD
46f4442e
A
636 ) != spanCondition
637 ) {
638 return s-1;
639 }
640 s+=3;
641 continue;
642 }
0f5d89e8 643 } else {
46f4442e 644 if( /* handle U+0000..U+07FF inline */
0f5d89e8 645 b>=0xc0 &&
46f4442e
A
646 (t1=(uint8_t)(*s-0x80)) <= 0x3f
647 ) {
648 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
649 return s-1;
650 }
651 ++s;
652 continue;
653 }
654 }
655
656 // Give an illegal sequence the same value as the result of contains(FFFD).
657 // Handle each byte of an illegal sequence separately to simplify the code;
658 // no need to optimize error handling.
0f5d89e8 659 if(containsFFFD!=spanCondition) {
46f4442e
A
660 return s-1;
661 }
662 }
663
664 return limit0;
665}
666
667/*
668 * While going backwards through UTF-8 optimize only for ASCII.
669 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
670 * possible to tell from the last byte in a multi-byte sequence how many
671 * preceding bytes there should be. Therefore, going backwards through UTF-8
672 * is much harder than going forward.
673 */
674int32_t
675BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
676 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
677 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
678 }
679
680 uint8_t b;
681
682 do {
683 b=s[--length];
0f5d89e8 684 if(U8_IS_SINGLE(b)) {
46f4442e
A
685 // ASCII sub-span
686 if(spanCondition) {
687 do {
0f5d89e8 688 if(!latin1Contains[b]) {
46f4442e
A
689 return length+1;
690 } else if(length==0) {
691 return 0;
692 }
693 b=s[--length];
0f5d89e8 694 } while(U8_IS_SINGLE(b));
46f4442e
A
695 } else {
696 do {
0f5d89e8 697 if(latin1Contains[b]) {
46f4442e
A
698 return length+1;
699 } else if(length==0) {
700 return 0;
701 }
702 b=s[--length];
0f5d89e8 703 } while(U8_IS_SINGLE(b));
46f4442e
A
704 }
705 }
706
707 int32_t prev=length;
708 UChar32 c;
51004dcb
A
709 // trail byte: collect a multi-byte character
710 // (or lead byte in last-trail position)
711 c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
46f4442e
A
712 // c is a valid code point, not ASCII, not a surrogate
713 if(c<=0x7ff) {
714 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
715 return prev+1;
716 }
717 } else if(c<=0xffff) {
718 int lead=c>>12;
719 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
720 if(twoBits<=1) {
721 // All 64 code points with the same bits 15..6
722 // are either in the set or not.
723 if(twoBits!=(uint32_t)spanCondition) {
724 return prev+1;
725 }
726 } else {
727 // Look up the code point in its 4k block of code points.
728 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
729 return prev+1;
730 }
731 }
732 } else {
733 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
734 return prev+1;
735 }
736 }
737 } while(length>0);
738 return 0;
739}
740
741U_NAMESPACE_END