]> git.saurik.com Git - apple/icu.git/blob - icuSources/i18n/affixpatternparser.cpp
ICU-62123.0.1.tar.gz
[apple/icu.git] / icuSources / i18n / affixpatternparser.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 * Copyright (C) 2015, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 * file name: affixpatternparser.cpp
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_FORMATTING
13
14 #include "unicode/dcfmtsym.h"
15 #include "unicode/plurrule.h"
16 #include "unicode/strenum.h"
17 #include "unicode/ucurr.h"
18 #include "unicode/ustring.h"
19 #include "affixpatternparser.h"
20 #include "charstr.h"
21 #include "precision.h"
22 #include "uassert.h"
23 #include "unistrappender.h"
24
25 static const UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
26
27 static const UChar gPercent = 0x25;
28 static const UChar gPerMill = 0x2030;
29 static const UChar gNegative = 0x2D;
30 static const UChar gPositive = 0x2B;
31
// Each token is stored in a single UChar: the high byte holds the token type
// (its top bit, 0x80, marks a continuation unit of a multi-unit literal
// length) and the low byte holds a length or count.

// Packs token type t and the low 8 bits of l into one UChar. Both arguments
// are fully parenthesized so that expression arguments (e.g. `a | b`) are
// masked as a whole rather than being split by operator precedence.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))

// Extracts the token type from a packed unit, ignoring the continuation bit.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))

// Non-zero (0x80) if the packed unit has the continuation bit set.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)

// Extracts the low byte (length / count) from a packed unit.
#define UNPACK_LENGTH(c) ((c) & 0xFF)
39
40 U_NAMESPACE_BEGIN
41
42 static int32_t
43 nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
44 if (buffer[idx] != 0x27 || idx + 1 == len) {
45 *token = buffer[idx];
46 return 1;
47 }
48 *token = buffer[idx + 1];
49 if (buffer[idx + 1] == 0xA4) {
50 int32_t i = 2;
51 for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
52 ;
53 return i;
54 }
55 return 2;
56 }
57
58 static int32_t
59 nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
60 *token = buffer[idx];
61 int32_t max;
62 switch (buffer[idx]) {
63 case 0x27:
64 max = 2;
65 break;
66 case 0xA4:
67 max = 3;
68 break;
69 default:
70 max = 1;
71 break;
72 }
73 int32_t i = 1;
74 for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
75 ;
76 return i;
77 }
78
79 CurrencyAffixInfo::CurrencyAffixInfo()
80 : fSymbol(gDefaultSymbols, 1),
81 fISO(gDefaultSymbols, 2),
82 fLong(DigitAffix(gDefaultSymbols, 3)),
83 fIsDefault(TRUE) {
84 }
85
86 void
87 CurrencyAffixInfo::set(
88 const char *locale,
89 const PluralRules *rules,
90 const UChar *currency,
91 UErrorCode &status) {
92 if (U_FAILURE(status)) {
93 return;
94 }
95 fIsDefault = FALSE;
96 if (currency == NULL) {
97 fSymbol.setTo(gDefaultSymbols, 1);
98 fISO.setTo(gDefaultSymbols, 2);
99 fLong.remove();
100 fLong.append(gDefaultSymbols, 3);
101 fIsDefault = TRUE;
102 return;
103 }
104 int32_t len;
105 UBool unusedIsChoice;
106 const UChar *symbol = ucurr_getName(
107 currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
108 &len, &status);
109 if (U_FAILURE(status)) {
110 return;
111 }
112 fSymbol.setTo(symbol, len);
113 fISO.setTo(currency, u_strlen(currency));
114 fLong.remove();
115 StringEnumeration* keywords = rules->getKeywords(status);
116 if (U_FAILURE(status)) {
117 return;
118 }
119 const UnicodeString* pluralCount;
120 while ((pluralCount = keywords->snext(status)) != NULL) {
121 CharString pCount;
122 pCount.appendInvariantChars(*pluralCount, status);
123 const UChar *pluralName = ucurr_getPluralName(
124 currency, locale, &unusedIsChoice, pCount.data(),
125 &len, &status);
126 fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
127 }
128 delete keywords;
129 }
130
131 void
132 CurrencyAffixInfo::adjustPrecision(
133 const UChar *currency, const UCurrencyUsage usage,
134 FixedPrecision &precision, UErrorCode &status) {
135 if (U_FAILURE(status)) {
136 return;
137 }
138
139 int32_t digitCount = ucurr_getDefaultFractionDigitsForUsage(
140 currency, usage, &status);
141 precision.fMin.setFracDigitCount(digitCount);
142 precision.fMax.setFracDigitCount(digitCount);
143 double increment = ucurr_getRoundingIncrementForUsage(
144 currency, usage, &status);
145 if (increment == 0.0) {
146 precision.fRoundingIncrement.clear();
147 } else {
148 precision.fRoundingIncrement.set(increment);
149 // guard against round-off error
150 precision.fRoundingIncrement.round(6);
151 }
152 }
153
154 void
155 AffixPattern::addLiteral(
156 const UChar *literal, int32_t start, int32_t len) {
157 char32Count += u_countChar32(literal + start, len);
158 literals.append(literal, start, len);
159 int32_t tlen = tokens.length();
160 // Takes 4 UChars to encode maximum literal length.
161 UChar *tokenChars = tokens.getBuffer(tlen + 4);
162
163 // find start of literal size. May be tlen if there is no literal.
164 // While finding start of literal size, compute literal length
165 int32_t literalLength = 0;
166 int32_t tLiteralStart = tlen;
167 while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
168 tLiteralStart--;
169 literalLength <<= 8;
170 literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
171 }
172 // Add number of chars we just added to literal
173 literalLength += len;
174
175 // Now encode the new length starting at tLiteralStart
176 tlen = tLiteralStart;
177 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
178 literalLength >>= 8;
179 while (literalLength) {
180 tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
181 literalLength >>= 8;
182 }
183 tokens.releaseBuffer(tlen);
184 }
185
186 void
187 AffixPattern::add(ETokenType t) {
188 add(t, 1);
189 }
190
191 void
192 AffixPattern::addCurrency(uint8_t count) {
193 add(kCurrency, count);
194 }
195
196 void
197 AffixPattern::add(ETokenType t, uint8_t count) {
198 U_ASSERT(t != kLiteral);
199 char32Count += count;
200 switch (t) {
201 case kCurrency:
202 hasCurrencyToken = TRUE;
203 break;
204 case kPercent:
205 hasPercentToken = TRUE;
206 break;
207 case kPerMill:
208 hasPermillToken = TRUE;
209 break;
210 default:
211 // Do nothing
212 break;
213 }
214 tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
215 }
216
217 AffixPattern &
218 AffixPattern::append(const AffixPattern &other) {
219 AffixPatternIterator iter;
220 other.iterator(iter);
221 UnicodeString literal;
222 while (iter.nextToken()) {
223 switch (iter.getTokenType()) {
224 case kLiteral:
225 iter.getLiteral(literal);
226 addLiteral(literal.getBuffer(), 0, literal.length());
227 break;
228 case kCurrency:
229 addCurrency(static_cast<uint8_t>(iter.getTokenLength()));
230 break;
231 default:
232 add(iter.getTokenType());
233 break;
234 }
235 }
236 return *this;
237 }
238
239 void
240 AffixPattern::remove() {
241 tokens.remove();
242 literals.remove();
243 hasCurrencyToken = FALSE;
244 hasPercentToken = FALSE;
245 hasPermillToken = FALSE;
246 char32Count = 0;
247 }
248
249 // escapes literals for strings where special characters are NOT escaped
250 // except for apostrophe.
251 static void escapeApostropheInLiteral(
252 const UnicodeString &literal, UnicodeStringAppender &appender) {
253 int32_t len = literal.length();
254 const UChar *buffer = literal.getBuffer();
255 for (int32_t i = 0; i < len; ++i) {
256 UChar ch = buffer[i];
257 switch (ch) {
258 case 0x27:
259 appender.append((UChar) 0x27);
260 appender.append((UChar) 0x27);
261 break;
262 default:
263 appender.append(ch);
264 break;
265 }
266 }
267 }
268
269
270 // escapes literals for user strings where special characters in literals
271 // are escaped with apostrophe.
272 static void escapeLiteral(
273 const UnicodeString &literal, UnicodeStringAppender &appender) {
274 int32_t len = literal.length();
275 const UChar *buffer = literal.getBuffer();
276 for (int32_t i = 0; i < len; ++i) {
277 UChar ch = buffer[i];
278 switch (ch) {
279 case 0x27:
280 appender.append((UChar) 0x27);
281 appender.append((UChar) 0x27);
282 break;
283 case 0x25:
284 appender.append((UChar) 0x27);
285 appender.append((UChar) 0x25);
286 appender.append((UChar) 0x27);
287 break;
288 case 0x2030:
289 appender.append((UChar) 0x27);
290 appender.append((UChar) 0x2030);
291 appender.append((UChar) 0x27);
292 break;
293 case 0xA4:
294 appender.append((UChar) 0x27);
295 appender.append((UChar) 0xA4);
296 appender.append((UChar) 0x27);
297 break;
298 case 0x2D:
299 appender.append((UChar) 0x27);
300 appender.append((UChar) 0x2D);
301 appender.append((UChar) 0x27);
302 break;
303 case 0x2B:
304 appender.append((UChar) 0x27);
305 appender.append((UChar) 0x2B);
306 appender.append((UChar) 0x27);
307 break;
308 default:
309 appender.append(ch);
310 break;
311 }
312 }
313 }
314
315 UnicodeString &
316 AffixPattern::toString(UnicodeString &appendTo) const {
317 AffixPatternIterator iter;
318 iterator(iter);
319 UnicodeStringAppender appender(appendTo);
320 UnicodeString literal;
321 while (iter.nextToken()) {
322 switch (iter.getTokenType()) {
323 case kLiteral:
324 escapeApostropheInLiteral(iter.getLiteral(literal), appender);
325 break;
326 case kPercent:
327 appender.append((UChar) 0x27);
328 appender.append((UChar) 0x25);
329 break;
330 case kPerMill:
331 appender.append((UChar) 0x27);
332 appender.append((UChar) 0x2030);
333 break;
334 case kCurrency:
335 {
336 appender.append((UChar) 0x27);
337 int32_t cl = iter.getTokenLength();
338 for (int32_t i = 0; i < cl; ++i) {
339 appender.append((UChar) 0xA4);
340 }
341 }
342 break;
343 case kNegative:
344 appender.append((UChar) 0x27);
345 appender.append((UChar) 0x2D);
346 break;
347 case kPositive:
348 appender.append((UChar) 0x27);
349 appender.append((UChar) 0x2B);
350 break;
351 default:
352 U_ASSERT(FALSE);
353 break;
354 }
355 }
356 return appendTo;
357 }
358
359 UnicodeString &
360 AffixPattern::toUserString(UnicodeString &appendTo) const {
361 AffixPatternIterator iter;
362 iterator(iter);
363 UnicodeStringAppender appender(appendTo);
364 UnicodeString literal;
365 while (iter.nextToken()) {
366 switch (iter.getTokenType()) {
367 case kLiteral:
368 escapeLiteral(iter.getLiteral(literal), appender);
369 break;
370 case kPercent:
371 appender.append((UChar) 0x25);
372 break;
373 case kPerMill:
374 appender.append((UChar) 0x2030);
375 break;
376 case kCurrency:
377 {
378 int32_t cl = iter.getTokenLength();
379 for (int32_t i = 0; i < cl; ++i) {
380 appender.append((UChar) 0xA4);
381 }
382 }
383 break;
384 case kNegative:
385 appender.append((UChar) 0x2D);
386 break;
387 case kPositive:
388 appender.append((UChar) 0x2B);
389 break;
390 default:
391 U_ASSERT(FALSE);
392 break;
393 }
394 }
395 return appendTo;
396 }
397
398 class AffixPatternAppender : public UMemory {
399 public:
400 AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }
401
402 inline void append(UChar x) {
403 if (fIdx == UPRV_LENGTHOF(fBuffer)) {
404 fDest->addLiteral(fBuffer, 0, fIdx);
405 fIdx = 0;
406 }
407 fBuffer[fIdx++] = x;
408 }
409
410 inline void append(UChar32 x) {
411 if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
412 fDest->addLiteral(fBuffer, 0, fIdx);
413 fIdx = 0;
414 }
415 U16_APPEND_UNSAFE(fBuffer, fIdx, x);
416 }
417
418 inline void flush() {
419 if (fIdx) {
420 fDest->addLiteral(fBuffer, 0, fIdx);
421 }
422 fIdx = 0;
423 }
424
425 /**
426 * flush the buffer when we go out of scope.
427 */
428 ~AffixPatternAppender() {
429 flush();
430 }
431 private:
432 AffixPattern *fDest;
433 int32_t fIdx;
434 UChar fBuffer[32];
435 AffixPatternAppender(const AffixPatternAppender &other);
436 AffixPatternAppender &operator=(const AffixPatternAppender &other);
437 };
438
439
440 AffixPattern &
441 AffixPattern::parseUserAffixString(
442 const UnicodeString &affixStr,
443 AffixPattern &appendTo,
444 UErrorCode &status) {
445 if (U_FAILURE(status)) {
446 return appendTo;
447 }
448 int32_t len = affixStr.length();
449 const UChar *buffer = affixStr.getBuffer();
450 // 0 = not quoted; 1 = quoted.
451 int32_t state = 0;
452 AffixPatternAppender appender(appendTo);
453 for (int32_t i = 0; i < len; ) {
454 UChar token;
455 int32_t tokenSize = nextUserToken(buffer, i, len, &token);
456 i += tokenSize;
457 if (token == 0x27 && tokenSize == 1) { // quote
458 state = 1 - state;
459 continue;
460 }
461 if (state == 0) {
462 switch (token) {
463 case 0x25:
464 appender.flush();
465 appendTo.add(kPercent, 1);
466 break;
467 case 0x27: // double quote
468 appender.append((UChar) 0x27);
469 break;
470 case 0x2030:
471 appender.flush();
472 appendTo.add(kPerMill, 1);
473 break;
474 case 0x2D:
475 appender.flush();
476 appendTo.add(kNegative, 1);
477 break;
478 case 0x2B:
479 appender.flush();
480 appendTo.add(kPositive, 1);
481 break;
482 case 0xA4:
483 appender.flush();
484 appendTo.add(kCurrency, static_cast<uint8_t>(tokenSize));
485 break;
486 default:
487 appender.append(token);
488 break;
489 }
490 } else {
491 switch (token) {
492 case 0x27: // double quote
493 appender.append((UChar) 0x27);
494 break;
495 case 0xA4: // included b/c tokenSize can be > 1
496 for (int32_t j = 0; j < tokenSize; ++j) {
497 appender.append((UChar) 0xA4);
498 }
499 break;
500 default:
501 appender.append(token);
502 break;
503 }
504 }
505 }
506 return appendTo;
507 }
508
509 AffixPattern &
510 AffixPattern::parseAffixString(
511 const UnicodeString &affixStr,
512 AffixPattern &appendTo,
513 UErrorCode &status) {
514 if (U_FAILURE(status)) {
515 return appendTo;
516 }
517 int32_t len = affixStr.length();
518 const UChar *buffer = affixStr.getBuffer();
519 for (int32_t i = 0; i < len; ) {
520 UChar token;
521 int32_t tokenSize = nextToken(buffer, i, len, &token);
522 if (tokenSize == 1) {
523 int32_t literalStart = i;
524 ++i;
525 while (i < len && (tokenSize = nextToken(buffer, i, len, &token)) == 1) {
526 ++i;
527 }
528 appendTo.addLiteral(buffer, literalStart, i - literalStart);
529
530 // If we reached end of string, we are done
531 if (i == len) {
532 return appendTo;
533 }
534 }
535 i += tokenSize;
536 switch (token) {
537 case 0x25:
538 appendTo.add(kPercent, 1);
539 break;
540 case 0x2030:
541 appendTo.add(kPerMill, 1);
542 break;
543 case 0x2D:
544 appendTo.add(kNegative, 1);
545 break;
546 case 0x2B:
547 appendTo.add(kPositive, 1);
548 break;
549 case 0xA4:
550 {
551 if (tokenSize - 1 > 3) {
552 status = U_PARSE_ERROR;
553 return appendTo;
554 }
555 appendTo.add(kCurrency, tokenSize - 1);
556 }
557 break;
558 default:
559 appendTo.addLiteral(&token, 0, 1);
560 break;
561 }
562 }
563 return appendTo;
564 }
565
566 AffixPatternIterator &
567 AffixPattern::iterator(AffixPatternIterator &result) const {
568 result.nextLiteralIndex = 0;
569 result.lastLiteralLength = 0;
570 result.nextTokenIndex = 0;
571 result.tokens = &tokens;
572 result.literals = &literals;
573 return result;
574 }
575
576 UBool
577 AffixPatternIterator::nextToken() {
578 int32_t tlen = tokens->length();
579 if (nextTokenIndex == tlen) {
580 return FALSE;
581 }
582 ++nextTokenIndex;
583 const UChar *tokenBuffer = tokens->getBuffer();
584 if (UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]) ==
585 AffixPattern::kLiteral) {
586 while (nextTokenIndex < tlen &&
587 UNPACK_LONG(tokenBuffer[nextTokenIndex])) {
588 ++nextTokenIndex;
589 }
590 lastLiteralLength = 0;
591 int32_t i = nextTokenIndex - 1;
592 for (; UNPACK_LONG(tokenBuffer[i]); --i) {
593 lastLiteralLength <<= 8;
594 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
595 }
596 lastLiteralLength <<= 8;
597 lastLiteralLength |= UNPACK_LENGTH(tokenBuffer[i]);
598 nextLiteralIndex += lastLiteralLength;
599 }
600 return TRUE;
601 }
602
603 AffixPattern::ETokenType
604 AffixPatternIterator::getTokenType() const {
605 return UNPACK_TOKEN(tokens->charAt(nextTokenIndex - 1));
606 }
607
608 UnicodeString &
609 AffixPatternIterator::getLiteral(UnicodeString &result) const {
610 const UChar *buffer = literals->getBuffer();
611 result.setTo(buffer + (nextLiteralIndex - lastLiteralLength), lastLiteralLength);
612 return result;
613 }
614
615 int32_t
616 AffixPatternIterator::getTokenLength() const {
617 const UChar *tokenBuffer = tokens->getBuffer();
618 AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
619 return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
620 }
621
622 AffixPatternParser::AffixPatternParser()
623 : fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
624 }
625
626 AffixPatternParser::AffixPatternParser(
627 const DecimalFormatSymbols &symbols) {
628 setDecimalFormatSymbols(symbols);
629 }
630
631 void
632 AffixPatternParser::setDecimalFormatSymbols(
633 const DecimalFormatSymbols &symbols) {
634 fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
635 fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
636 fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
637 fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
638 }
639
640 PluralAffix &
641 AffixPatternParser::parse(
642 const AffixPattern &affixPattern,
643 const CurrencyAffixInfo &currencyAffixInfo,
644 PluralAffix &appendTo,
645 UErrorCode &status) const {
646 if (U_FAILURE(status)) {
647 return appendTo;
648 }
649 AffixPatternIterator iter;
650 affixPattern.iterator(iter);
651 UnicodeString literal;
652 while (iter.nextToken()) {
653 switch (iter.getTokenType()) {
654 case AffixPattern::kPercent:
655 appendTo.append(fPercent, UNUM_PERCENT_FIELD);
656 break;
657 case AffixPattern::kPerMill:
658 appendTo.append(fPermill, UNUM_PERMILL_FIELD);
659 break;
660 case AffixPattern::kNegative:
661 appendTo.append(fNegative, UNUM_SIGN_FIELD);
662 break;
663 case AffixPattern::kPositive:
664 appendTo.append(fPositive, UNUM_SIGN_FIELD);
665 break;
666 case AffixPattern::kCurrency:
667 switch (iter.getTokenLength()) {
668 case 1:
669 appendTo.append(
670 currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
671 break;
672 case 2:
673 appendTo.append(
674 currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
675 break;
676 case 3:
677 appendTo.append(
678 currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
679 break;
680 default:
681 U_ASSERT(FALSE);
682 break;
683 }
684 break;
685 case AffixPattern::kLiteral:
686 appendTo.append(iter.getLiteral(literal));
687 break;
688 default:
689 U_ASSERT(FALSE);
690 break;
691 }
692 }
693 return appendTo;
694 }
695
696
697 U_NAMESPACE_END
698 #endif /* #if !UCONFIG_NO_FORMATTING */