+// Converts this string from UTF-16 to UTF-8 and appends the bytes to |sink|.
+//
+// When converting from UTF-16 to UTF-8, the result will have at most 3 times
+// as many bytes as the source has UChars.
+// The "worst cases" are writing systems like Indic, Thai and CJK with
+// 3:1 bytes:UChars.
+//
+// Unpaired surrogates are replaced with U+FFFD. An empty string leaves the
+// sink untouched (no Append(), no Flush()). If the conversion fails, or the
+// fallback heap buffer cannot be allocated, nothing is appended; this function
+// has no way to report that failure to the caller.
+void
+UnicodeString::toUTF8(ByteSink &sink) const {
+  int32_t length16 = length();
+  if(length16 == 0) {
+    return;
+  }
+
+  char stackBuffer[1024];
+  int32_t capacity = (int32_t)sizeof(stackBuffer);
+  UBool utf8IsOwned = FALSE;
+
+  // The desired-capacity hint is the 3:1 worst case. Computing 3*length16
+  // directly overflows int32_t for strings longer than INT32_MAX/3 UChars
+  // (signed overflow is undefined behavior and would hand the sink a garbage,
+  // possibly negative, hint), so clamp it to the largest int32_t instead.
+  const int32_t maxInt32 = 0x7fffffff;
+  int32_t desiredCapacity = (length16 <= maxInt32 / 3) ? 3 * length16 : maxInt32;
+
+  // Ask the sink for a buffer to write into, offering the stack buffer as
+  // scratch space. The sink stores the capacity of the buffer it returns
+  // into |capacity|.
+  char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
+                                    desiredCapacity,
+                                    stackBuffer, capacity,
+                                    &capacity);
+  int32_t length8 = 0;
+  UErrorCode errorCode = U_ZERO_ERROR;
+  u_strToUTF8WithSub(utf8, capacity, &length8,
+                     getBuffer(), length16,
+                     0xFFFD,  // Standard substitution character.
+                     NULL,    // Don't care about number of substitutions.
+                     &errorCode);
+  if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
+    // The sink's buffer was too small. The first call doubled as a preflight:
+    // length8 is used as the exact size for a temporary heap buffer, and the
+    // conversion is run again into that buffer.
+    utf8 = (char *)uprv_malloc(length8);
+    if(utf8 != NULL) {
+      utf8IsOwned = TRUE;
+      errorCode = U_ZERO_ERROR;
+      u_strToUTF8WithSub(utf8, length8, &length8,
+                         getBuffer(), length16,
+                         0xFFFD,  // Standard substitution character.
+                         NULL,    // Don't care about number of substitutions.
+                         &errorCode);
+    } else {
+      errorCode = U_MEMORY_ALLOCATION_ERROR;
+    }
+  }
+  if(U_SUCCESS(errorCode)) {
+    sink.Append(utf8, length8);
+    sink.Flush();
+  }
+  // Only the heap fallback is ours to release; the sink's buffer and the
+  // stack buffer are not.
+  if(utf8IsOwned) {
+    uprv_free(utf8);
+  }
+}
+
+// Converts this string to UTF-32, writing into |utf32| (room for |capacity|
+// UChar32 units), and returns the destination length reported by
+// u_strToUTF32WithSub(). Returns 0 without doing any work if |errorCode|
+// already indicates a failure on entry.
+int32_t
+UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
+  if(!U_SUCCESS(errorCode)) {
+    return 0;
+  }
+  int32_t resultLength = 0;
+  // Illegal arguments are not checked here: that is left to getBuffer()
+  // and u_strToUTF32WithSub().
+  u_strToUTF32WithSub(utf32, capacity, &resultLength,
+                      getBuffer(), length(),
+                      0xfffd,  // Substitution character.
+                      NULL,    // Don't care about number of substitutions.
+                      &errorCode);
+  return resultLength;
+}
+