1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
4 *******************************************************************************
6 * Copyright (C) 1998-2016, International Business Machines
7 * Corporation and others. All Rights Reserved.
9 *******************************************************************************
13 * Modification History:
15 * Date Name Description
16 * 05/10/01 Ram Creation.
17 *******************************************************************************
20 #include "unicode/utypes.h"
21 #include "unicode/putil.h"
22 #include "unicode/uchar.h"
23 #include "unicode/ucnv.h"
24 #include "unicode/ucnv_err.h"
25 #include "unicode/ustring.h"
26 #include "unicode/utf16.h"
34 #if !UCONFIG_NO_CONVERSION
37 #define MAX_IN_BUF 1000
38 #define MAX_U_BUF 1500
39 #define CONTEXT_LEN 20
47 int32_t signatureLength
;
50 UBool showWarning
; /* makes this API not produce any errors */
54 U_CAPI UBool U_EXPORT2
55 ucbuf_autodetect_fs(FileStream
* in
, const char** cp
, UConverter
** conv
, int32_t* signatureLength
, UErrorCode
* error
){
59 UChar target
[1]={ 0 };
63 /* read a few bytes */
64 numRead
=T_FileStream_read(in
, start
, sizeof(start
));
66 *cp
= ucnv_detectUnicodeSignature(start
, numRead
, signatureLength
, error
);
68 /* unread the bytes beyond what was consumed for U+FEFF */
69 T_FileStream_rewind(in
);
70 if (*signatureLength
> 0) {
71 T_FileStream_read(in
, start
, *signatureLength
);
79 /* open the converter for the detected Unicode charset */
80 *conv
= ucnv_open(*cp
,error
);
82 /* convert and ignore initial U+FEFF, and the buffer overflow */
85 ucnv_toUnicode(*conv
, &pTarget
, target
+1, &pStart
, start
+*signatureLength
, NULL
, FALSE
, error
);
86 *signatureLength
= (int32_t)(pStart
- start
);
87 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
91 /* verify that we successfully read exactly U+FEFF */
92 if(U_SUCCESS(*error
) && (pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
93 *error
=U_INTERNAL_PROGRAM_ERROR
;
99 static UBool
ucbuf_isCPKnown(const char* cp
){
100 if(ucnv_compareNames("UTF-8",cp
)==0){
103 if(ucnv_compareNames("UTF-16BE",cp
)==0){
106 if(ucnv_compareNames("UTF-16LE",cp
)==0){
109 if(ucnv_compareNames("UTF-16",cp
)==0){
112 if(ucnv_compareNames("UTF-32",cp
)==0){
115 if(ucnv_compareNames("UTF-32BE",cp
)==0){
118 if(ucnv_compareNames("UTF-32LE",cp
)==0){
121 if(ucnv_compareNames("SCSU",cp
)==0){
124 if(ucnv_compareNames("BOCU-1",cp
)==0){
127 if(ucnv_compareNames("UTF-7",cp
)==0){
133 U_CAPI FileStream
* U_EXPORT2
134 ucbuf_autodetect(const char* fileName
, const char** cp
,UConverter
** conv
, int32_t* signatureLength
,UErrorCode
* error
){
136 if(error
==NULL
|| U_FAILURE(*error
)){
139 if(conv
==NULL
|| cp
==NULL
|| fileName
==NULL
){
140 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
144 in
= T_FileStream_open(fileName
,"rb");
147 *error
=U_FILE_ACCESS_ERROR
;
151 if(ucbuf_autodetect_fs(in
,cp
,conv
,signatureLength
,error
)) {
156 T_FileStream_close(in
);
161 /* fill the uchar buffer */
163 ucbuf_fillucbuf( UCHARBUF
* buf
,UErrorCode
* error
){
166 const char* source
=NULL
;
167 char carr
[MAX_IN_BUF
] = {'\0'};
170 int32_t outputWritten
=0;
172 const char* sourceLimit
=NULL
;
174 pTarget
= buf
->buffer
;
175 /* check if we arrived here without exhausting the buffer*/
176 if(buf
->currentPos
<buf
->bufLimit
){
177 offset
= (int32_t)(buf
->bufLimit
-buf
->currentPos
);
178 memmove(buf
->buffer
,buf
->currentPos
,offset
* sizeof(UChar
));
182 memset(pTarget
+offset
,0xff,sizeof(UChar
)*(MAX_IN_BUF
-offset
));
185 cbufSize
= MAX_IN_BUF
;
187 inputRead
=T_FileStream_read(buf
->in
,cbuf
,cbufSize
-offset
);
188 buf
->remaining
-=inputRead
;
191 cbufSize
= T_FileStream_size(buf
->in
);
192 cbuf
= (char*)uprv_malloc(cbufSize
);
194 *error
= U_MEMORY_ALLOCATION_ERROR
;
197 inputRead
= T_FileStream_read(buf
->in
,cbuf
,cbufSize
);
198 buf
->remaining
-=inputRead
;
201 /* just to be sure...*/
202 if ( 0 == inputRead
)
206 /* convert the bytes */
208 /* set the callback to stop */
209 UConverterToUCallback toUOldAction
;
211 void* toUNewContext
=NULL
;
212 ucnv_setToUCallBack(buf
->conv
,
213 UCNV_TO_U_CALLBACK_STOP
,
216 (const void**)&toUOldContext
,
218 /* since state is saved in the converter we add offset to source*/
219 target
= pTarget
+offset
;
221 sourceLimit
= source
+ inputRead
;
222 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
223 &source
,sourceLimit
,NULL
,
224 (UBool
)(buf
->remaining
==0),error
);
226 if(U_FAILURE(*error
)){
227 char context
[CONTEXT_LEN
+1];
228 char preContext
[CONTEXT_LEN
+1];
229 char postContext
[CONTEXT_LEN
+1];
230 int8_t len
= CONTEXT_LEN
;
234 /* use error1 so the original error code in *error is preserved */
235 UErrorCode error1
=U_ZERO_ERROR
;
237 if( buf
->showWarning
==TRUE
){
238 fprintf(stderr
,"\n###WARNING: Encountered abnormal bytes while"
239 " converting input stream to target encoding: %s\n",
240 u_errorName(*error
));
244 /* now get the context chars */
245 ucnv_getInvalidChars(buf
->conv
,context
,&len
,&error1
);
246 context
[len
]= 0 ; /* null terminate the buffer */
248 pos
= (int32_t)(source
- cbuf
- len
);
250 /* for pre-context */
251 start
= (pos
<=CONTEXT_LEN
)? 0 : (pos
- (CONTEXT_LEN
-1));
254 memcpy(preContext
,cbuf
+start
,stop
-start
);
255 /* null terminate the buffer */
256 preContext
[stop
-start
] = 0;
258 /* for post-context */
260 stop
= (int32_t)(((pos
+CONTEXT_LEN
)<= (sourceLimit
-cbuf
) )? (pos
+(CONTEXT_LEN
-1)) : (sourceLimit
-cbuf
));
262 memcpy(postContext
,source
,stop
-start
);
263 /* null terminate the buffer */
264 postContext
[stop
-start
] = 0;
266 if(buf
->showWarning
==TRUE
){
267 /* print out the context */
268 fprintf(stderr
,"\tPre-context: %s\n",preContext
);
269 fprintf(stderr
,"\tContext: %s\n",context
);
270 fprintf(stderr
,"\tPost-context: %s\n", postContext
);
273 /* reset the converter */
274 ucnv_reset(buf
->conv
);
276 /* set the call back to substitute
277 * and restart conversion
279 ucnv_setToUCallBack(buf
->conv
,
280 UCNV_TO_U_CALLBACK_SUBSTITUTE
,
283 (const void**)&toUOldContext
,
286 /* reset source and target start positions */
287 target
= pTarget
+offset
;
291 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
292 &source
,sourceLimit
,NULL
,
293 (UBool
)(buf
->remaining
==0),&error1
);
296 outputWritten
= (int32_t)(target
- pTarget
);
302 for(i
=0;i
<numRead
;i
++){
303 /* printf("%c", (char)(*target++));*/
309 u_charsToUChars(cbuf
,target
+offset
,inputRead
);
310 outputWritten
=((buf
->remaining
>cbufSize
)? cbufSize
:inputRead
+offset
);
312 buf
->currentPos
= pTarget
;
313 buf
->bufLimit
=pTarget
+outputWritten
;
314 *buf
->bufLimit
=0; /*NUL terminate*/
323 /* get a UChar from the stream*/
324 U_CAPI
int32_t U_EXPORT2
325 ucbuf_getc(UCHARBUF
* buf
,UErrorCode
* error
){
326 if(error
==NULL
|| U_FAILURE(*error
)){
329 if(buf
->currentPos
>=buf
->bufLimit
){
330 if(buf
->remaining
==0){
333 buf
=ucbuf_fillucbuf(buf
,error
);
334 if(U_FAILURE(*error
)){
339 return *(buf
->currentPos
++);
342 /* get a UChar32 from the stream*/
343 U_CAPI
int32_t U_EXPORT2
344 ucbuf_getc32(UCHARBUF
* buf
,UErrorCode
* error
){
345 int32_t retVal
= (int32_t)U_EOF
;
346 if(error
==NULL
|| U_FAILURE(*error
)){
349 if(buf
->currentPos
+1>=buf
->bufLimit
){
350 if(buf
->remaining
==0){
353 buf
=ucbuf_fillucbuf(buf
,error
);
354 if(U_FAILURE(*error
)){
358 if(U16_IS_LEAD(*(buf
->currentPos
))){
359 retVal
=U16_GET_SUPPLEMENTARY(buf
->currentPos
[0],buf
->currentPos
[1]);
362 retVal
= *(buf
->currentPos
++);
367 /* u_unescapeAt() callback to return a UChar*/
368 static UChar U_CALLCONV
369 _charAt(int32_t offset
, void *context
) {
370 return ((UCHARBUF
*) context
)->currentPos
[offset
];
373 /* getc and escape it */
374 U_CAPI
int32_t U_EXPORT2
375 ucbuf_getcx32(UCHARBUF
* buf
,UErrorCode
* error
) {
379 if(error
==NULL
|| U_FAILURE(*error
)){
382 /* Fill the buffer if it is empty */
383 if (buf
->currentPos
>=buf
->bufLimit
-2) {
384 ucbuf_fillucbuf(buf
,error
);
387 /* Get the next character in the buffer */
388 if (buf
->currentPos
< buf
->bufLimit
) {
389 c1
= *(buf
->currentPos
)++;
394 c2
= *(buf
->currentPos
);
396 /* If it isn't a backslash, return it */
401 /* Determine the amount of data in the buffer */
402 length
= (int32_t)(buf
->bufLimit
- buf
->currentPos
);
404 /* The longest escape sequence is \Uhhhhhhhh; make sure
405 we have at least that many characters */
408 /* fill the buffer */
409 ucbuf_fillucbuf(buf
,error
);
410 length
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
413 /* Process the escape */
415 c32
= u_unescapeAt(_charAt
, &offset
, length
, (void*)buf
);
417 /* check if u_unescapeAt unescaped and converted
420 if(c32
==(UChar32
)0xFFFFFFFF){
421 if(buf
->showWarning
) {
422 char context
[CONTEXT_LEN
+1];
423 int32_t len
= CONTEXT_LEN
;
427 context
[len
]= 0 ; /* null terminate the buffer */
428 u_UCharsToChars( buf
->currentPos
, context
, len
);
429 fprintf(stderr
,"Bad escape: [%c%s]...\n", (int)c1
, context
);
431 *error
= U_ILLEGAL_ESCAPE_SEQUENCE
;
433 }else if(c32
!=c2
|| (c32
==0x0075 && c2
==0x0075 && c1
==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
434 /* Update the current buffer position */
435 buf
->currentPos
+= offset
;
437 /* unescaping failed so we just return
438 * c1 and not consume the buffer
439 * this is useful for rules with escapes
449 U_CAPI UCHARBUF
* U_EXPORT2
450 ucbuf_open(const char* fileName
,const char** cp
,UBool showWarning
, UBool buffered
, UErrorCode
* error
){
452 FileStream
* in
= NULL
;
455 if(error
==NULL
|| U_FAILURE(*error
)){
458 if(cp
==NULL
|| fileName
==NULL
){
459 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
462 if (!uprv_strcmp(fileName
, "-")) {
463 in
= T_FileStream_stdin();
465 in
= T_FileStream_open(fileName
, "rb");
469 UCHARBUF
* buf
=(UCHARBUF
*) uprv_malloc(sizeof(UCHARBUF
));
470 fileSize
= T_FileStream_size(in
);
472 *error
= U_MEMORY_ALLOCATION_ERROR
;
473 T_FileStream_close(in
);
478 buf
->showWarning
= showWarning
;
479 buf
->isBuffered
= buffered
;
480 buf
->signatureLength
=0;
481 if(*cp
==NULL
|| **cp
=='\0'){
482 /* don't have code page name... try to autodetect */
483 ucbuf_autodetect_fs(in
,cp
,&buf
->conv
,&buf
->signatureLength
,error
);
484 }else if(ucbuf_isCPKnown(*cp
)){
486 ucbuf_autodetect_fs(in
,&knownCp
,&buf
->conv
,&buf
->signatureLength
,error
);
488 if(U_SUCCESS(*error
) && buf
->conv
==NULL
) {
489 buf
->conv
=ucnv_open(*cp
,error
);
491 if(U_FAILURE(*error
)){
492 ucnv_close(buf
->conv
);
494 T_FileStream_close(in
);
498 if((buf
->conv
==NULL
) && (buf
->showWarning
==TRUE
)){
499 fprintf(stderr
,"###WARNING: No converter defined. Using codepage of system.\n");
501 buf
->remaining
=fileSize
-buf
->signatureLength
;
503 buf
->bufCapacity
=MAX_U_BUF
;
505 buf
->bufCapacity
=buf
->remaining
+buf
->signatureLength
+1/*for terminating nul*/;
507 buf
->buffer
=(UChar
*) uprv_malloc(U_SIZEOF_UCHAR
* buf
->bufCapacity
);
508 if (buf
->buffer
== NULL
) {
509 *error
= U_MEMORY_ALLOCATION_ERROR
;
513 buf
->currentPos
=buf
->buffer
;
514 buf
->bufLimit
=buf
->buffer
;
515 if(U_FAILURE(*error
)){
516 fprintf(stderr
, "Could not open codepage [%s]: %s\n", *cp
, u_errorName(*error
));
520 ucbuf_fillucbuf(buf
,error
);
521 if(U_FAILURE(*error
)){
527 *error
=U_FILE_ACCESS_ERROR
;
533 /* TODO: this method will fail if at the
534 * beginning of buffer and the uchar to unget
535 * is from the previous buffer. Need to implement
536 * a system to take care of that situation.
538 U_CAPI
void U_EXPORT2
539 ucbuf_ungetc(int32_t c
,UCHARBUF
* buf
){
540 /* decrement currentPos pointer
541 * if not at the beginning of buffer
543 if(buf
->currentPos
!=buf
->buffer
){
544 if(*(buf
->currentPos
-1)==c
){
547 /* ungetc failed - did not match. */
550 /* ungetc failed - beginning of buffer. */
554 /* frees the resources of UChar* buffer */
556 ucbuf_closebuf(UCHARBUF
* buf
){
557 uprv_free(buf
->buffer
);
561 /* close the buf and release resources*/
562 U_CAPI
void U_EXPORT2
563 ucbuf_close(UCHARBUF
* buf
){
566 ucnv_close(buf
->conv
);
568 T_FileStream_close(buf
->in
);
574 /* rewind the buf and file stream */
575 U_CAPI
void U_EXPORT2
576 ucbuf_rewind(UCHARBUF
* buf
,UErrorCode
* error
){
577 if(error
==NULL
|| U_FAILURE(*error
)){
581 buf
->currentPos
=buf
->buffer
;
582 buf
->bufLimit
=buf
->buffer
;
583 T_FileStream_rewind(buf
->in
);
584 buf
->remaining
=T_FileStream_size(buf
->in
)-buf
->signatureLength
;
586 ucnv_resetToUnicode(buf
->conv
);
587 if(buf
->signatureLength
>0) {
588 UChar target
[1]={ 0 };
594 /* read the signature bytes */
595 numRead
=T_FileStream_read(buf
->in
, start
, buf
->signatureLength
);
597 /* convert and ignore initial U+FEFF, and the buffer overflow */
600 ucnv_toUnicode(buf
->conv
, &pTarget
, target
+1, &pStart
, start
+numRead
, NULL
, FALSE
, error
);
601 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
605 /* verify that we successfully read exactly U+FEFF */
606 if(U_SUCCESS(*error
) && (numRead
!=buf
->signatureLength
|| pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
607 *error
=U_INTERNAL_PROGRAM_ERROR
;
614 U_CAPI
int32_t U_EXPORT2
615 ucbuf_size(UCHARBUF
* buf
){
618 return (T_FileStream_size(buf
->in
)-buf
->signatureLength
)/ucnv_getMinCharSize(buf
->conv
);
620 return (int32_t)(buf
->bufLimit
- buf
->buffer
);
626 U_CAPI
const UChar
* U_EXPORT2
627 ucbuf_getBuffer(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* error
){
628 if(error
==NULL
|| U_FAILURE(*error
)){
631 if(buf
==NULL
|| len
==NULL
){
632 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
635 *len
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
639 U_CAPI
const char* U_EXPORT2
640 ucbuf_resolveFileName(const char* inputDir
, const char* fileName
, char* target
, int32_t* len
, UErrorCode
* status
){
641 int32_t requiredLen
= 0;
644 if(status
==NULL
|| U_FAILURE(*status
)){
648 if(inputDir
== NULL
|| fileName
== NULL
|| len
==NULL
|| (target
==NULL
&& *len
>0)){
649 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
654 dirlen
= (int32_t)uprv_strlen(inputDir
);
655 filelen
= (int32_t)uprv_strlen(fileName
);
656 if(inputDir
[dirlen
-1] != U_FILE_SEP_CHAR
) {
657 requiredLen
= dirlen
+ filelen
+ 2;
658 if((*len
< requiredLen
) || target
==NULL
){
660 *status
= U_BUFFER_OVERFLOW_ERROR
;
666 * append the input dir to openFileName if the first char in
667 * filename is not the file separator char and the last char of the input directory is not '.'.
668 * This is to support :
669 * genrb -s. /home/icu/data
671 * The user cannot mix notations like
672 * genrb -s. /icu/data --- the absolute path specified. -s redundant
674 * genrb -s. icu/data --- start from CWD and look in icu/data dir
676 if( (fileName
[0] != U_FILE_SEP_CHAR
) && (inputDir
[dirlen
-1] !='.')){
677 uprv_strcpy(target
, inputDir
);
678 target
[dirlen
] = U_FILE_SEP_CHAR
;
680 target
[dirlen
+ 1] = '\0';
682 requiredLen
= dirlen
+ filelen
+ 1;
683 if((*len
< requiredLen
) || target
==NULL
){
685 *status
= U_BUFFER_OVERFLOW_ERROR
;
689 uprv_strcpy(target
, inputDir
);
692 uprv_strcat(target
, fileName
);
696 * Unicode TR 13 says any of the below chars is
697 * a new line char in a readline function in addition
698 * to CR+LF combination which needs to be
701 static UBool
ucbuf_isCharNewLine(UChar c
){
703 case 0x000A: /* LF */
704 case 0x000D: /* CR */
705 case 0x000C: /* FF */
706 case 0x0085: /* NEL */
707 case 0x2028: /* LS */
708 case 0x2029: /* PS */
715 U_CAPI
const UChar
* U_EXPORT2
716 ucbuf_readline(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* err
){
717 UChar
* temp
= buf
->currentPos
;
718 UChar
* savePos
=NULL
;
721 /* The input is buffered we have to do more
722 * for returning a pointer U_TRUNCATED_CHAR_FOUND
726 if(buf
->remaining
==0){
727 return NULL
; /* end of file is reached return NULL */
729 if(temp
>=buf
->bufLimit
&& buf
->currentPos
== buf
->buffer
){
730 *err
= U_TRUNCATED_CHAR_FOUND
;
733 ucbuf_fillucbuf(buf
,err
);
739 * According to TR 13 readLine functions must interpret
740 * CR, CR+LF, LF, NEL, PS, LS or FF as line separators
743 if(c
==0x0d && temp
<= buf
->bufLimit
&& *temp
== 0x0a ){
744 *len
= (int32_t)(temp
++ - buf
->currentPos
);
745 savePos
= buf
->currentPos
;
746 buf
->currentPos
= temp
;
751 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)){ /* Unipad inserts 2028 line separators! */
752 *len
= (int32_t)(temp
- buf
->currentPos
);
753 savePos
= buf
->currentPos
;
754 buf
->currentPos
= temp
;
759 /* we know that all input is read into the internal
760 * buffer so we can safely return pointers
765 if(buf
->currentPos
==buf
->bufLimit
){
766 return NULL
; /* end of file is reached return NULL */
769 if(c
==0x0d && temp
<= buf
->bufLimit
&& *temp
== 0x0a ){
770 *len
= (int32_t)(temp
++ - buf
->currentPos
);
771 savePos
= buf
->currentPos
;
772 buf
->currentPos
= temp
;
776 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)) { /* Unipad inserts 2028 line separators! */
777 *len
= (int32_t)(temp
- buf
->currentPos
);
778 savePos
= buf
->currentPos
;
779 buf
->currentPos
= temp
;
785 /* A compiler warning will appear if all paths don't contain a return statement. */