2 *******************************************************************************
4 * Copyright (C) 1998-2006, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 05/10/01 Ram Creation.
15 *******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/putil.h"
20 #include "unicode/ucnv.h"
21 #include "unicode/ucnv_err.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
31 #if !UCONFIG_NO_CONVERSION
34 #define MAX_IN_BUF 1000
35 #define MAX_U_BUF 1500
36 #define CONTEXT_LEN 15
44 int32_t signatureLength
;
47 UBool showWarning
; /* makes this API not produce any errors */
/*
 * Detect the Unicode signature (byte order mark) at the start of the stream
 * `in` and open a matching converter.
 *
 * What the visible code does:
 *  - reads up to sizeof(start) bytes and passes them to
 *    ucnv_detectUnicodeSignature(), whose result is stored in *cp and which
 *    receives signatureLength as its output argument;
 *  - rewinds the stream and, when *signatureLength is greater than 0, reads
 *    exactly *signatureLength bytes again so that only the signature is
 *    consumed from the stream;
 *  - opens the converter named by *cp into *conv and converts the signature
 *    bytes into a one-UChar target, then recomputes *signatureLength from the
 *    number of bytes the converter consumed (pStart - start);
 *  - sets *error to U_INTERNAL_PROGRAM_ERROR when the conversion succeeded
 *    but did not produce exactly one UChar equal to U+FEFF.
 *
 * NOTE(review): the local declarations, the body of the
 * U_BUFFER_OVERFLOW_ERROR branch, the return statements and the closing
 * braces are not visible in this listing; the UBool result presumably tells
 * the caller whether a signature was detected -- confirm.
 */
51 U_CAPI UBool U_EXPORT2
52 ucbuf_autodetect_fs(FileStream
* in
, const char** cp
, UConverter
** conv
, int32_t* signatureLength
, UErrorCode
* error
){
56 UChar target
[1]={ 0 };
60 /* read a few bytes */
61 numRead
=T_FileStream_read(in
, start
, sizeof(start
));
63 *cp
= ucnv_detectUnicodeSignature(start
, numRead
, signatureLength
, error
);
65 /* unread the bytes beyond what was consumed for U+FEFF */
66 T_FileStream_rewind(in
);
67 if (*signatureLength
> 0) {
68 numRead
= T_FileStream_read(in
, start
, *signatureLength
);
76 /* open the converter for the detected Unicode charset */
77 *conv
= ucnv_open(*cp
,error
);
79 /* convert and ignore initial U+FEFF, and the buffer overflow */
82 ucnv_toUnicode(*conv
, &pTarget
, target
+1, &pStart
, start
+*signatureLength
, NULL
, FALSE
, error
);
83 *signatureLength
= (int32_t)(pStart
- start
);
84 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
88 /* verify that we successfully read exactly U+FEFF */
89 if(U_SUCCESS(*error
) && (pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
90 *error
=U_INTERNAL_PROGRAM_ERROR
;
/*
 * Test whether the charset name `cp` is one of the names listed below:
 * UTF-8, UTF-16BE, UTF-16LE, UTF-16, UTF-32, UTF-32BE, UTF-32LE, SCSU,
 * BOCU-1 and UTF-7. Each candidate is compared with ucnv_compareNames(),
 * where a result of 0 is treated as a match.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably TRUE is returned on the first match and FALSE otherwise --
 * confirm.
 */
96 static UBool
ucbuf_isCPKnown(const char* cp
){
97 if(ucnv_compareNames("UTF-8",cp
)==0){
100 if(ucnv_compareNames("UTF-16BE",cp
)==0){
103 if(ucnv_compareNames("UTF-16LE",cp
)==0){
106 if(ucnv_compareNames("UTF-16",cp
)==0){
109 if(ucnv_compareNames("UTF-32",cp
)==0){
112 if(ucnv_compareNames("UTF-32BE",cp
)==0){
115 if(ucnv_compareNames("UTF-32LE",cp
)==0){
118 if(ucnv_compareNames("SCSU",cp
)==0){
121 if(ucnv_compareNames("BOCU-1",cp
)==0){
124 if(ucnv_compareNames("UTF-7",cp
)==0){
/*
 * Open the file `fileName` in binary mode ("rb") and run
 * ucbuf_autodetect_fs() on it to detect a Unicode signature.
 *
 * Visible checks: a NULL or already-failed `error` is tested first; a NULL
 * conv, cp or fileName sets *error to U_ILLEGAL_ARGUMENT_ERROR. *error is
 * also set to U_FILE_ACCESS_ERROR (presumably when T_FileStream_open()
 * fails -- the condition is not visible here), and the stream is closed with
 * T_FileStream_close() on a path whose condition is likewise not visible.
 *
 * NOTE(review): signatureLength is forwarded to ucbuf_autodetect_fs(), which
 * dereferences it, but it is not part of the NULL check on conv, cp and
 * fileName -- confirm that callers never pass NULL.
 * NOTE(review): the return statements are not visible; the FileStream result
 * is presumably the opened stream when a signature was detected -- confirm.
 */
130 U_CAPI FileStream
* U_EXPORT2
131 ucbuf_autodetect(const char* fileName
, const char** cp
,UConverter
** conv
, int32_t* signatureLength
,UErrorCode
* error
){
133 if(error
==NULL
|| U_FAILURE(*error
)){
136 if(conv
==NULL
|| cp
==NULL
|| fileName
==NULL
){
137 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
141 in
= T_FileStream_open(fileName
,"rb");
144 *error
=U_FILE_ACCESS_ERROR
;
148 if(ucbuf_autodetect_fs(in
,cp
,conv
,signatureLength
,error
)) {
153 T_FileStream_close(in
);
158 /*
 * Fill the UChar buffer of `buf` with the next portion of the input stream.
 *
 * Steps visible in this listing:
 *  - UChars that were not consumed yet (currentPos < bufLimit) are moved to
 *    the front of buf->buffer with memmove(); `offset` is their count, and
 *    the area behind them is filled with 0xff bytes.
 *  - Bytes are read from buf->in in one of two ways: with cbufSize set to
 *    MAX_IN_BUF (reading cbufSize-offset bytes), or into a buffer obtained
 *    from uprv_malloc() whose size is T_FileStream_size(buf->in). In both
 *    cases buf->remaining is decreased by the number of bytes read.
 *    (Presumably buf->isBuffered selects between the two -- the condition is
 *    not visible here.)
 *  - The bytes are converted with ucnv_toUnicode(), first with
 *    UCNV_TO_U_CALLBACK_STOP installed. If that fails, a warning and the
 *    pre-context, context and post-context of the offending bytes are
 *    printed to stderr when buf->showWarning is TRUE, the converter is
 *    reset, UCNV_TO_U_CALLBACK_SUBSTITUTE is installed and the conversion is
 *    run again with its status written to the local error1.
 *  - In the other visible branch the bytes are copied with u_charsToUChars()
 *    (presumably used when there is no converter -- confirm).
 *  - Finally currentPos is set to the start of the buffer, bufLimit to
 *    start + outputWritten, and a NUL is stored at *bufLimit.
 *
 * NOTE(review): several declarations, branch conditions, the return
 * statement and all closing braces are not visible in this listing.
 */
160 ucbuf_fillucbuf( UCHARBUF
* buf
,UErrorCode
* error
){
163 const char* source
=NULL
;
164 char carr
[MAX_IN_BUF
] = {'\0'};
167 int32_t outputWritten
=0;
169 const char* sourceLimit
=NULL
;
171 pTarget
= buf
->buffer
;
172 /* check if we arrived here without exhausting the buffer*/
173 if(buf
->currentPos
<buf
->bufLimit
){
174 offset
= (int32_t)(buf
->bufLimit
-buf
->currentPos
);
175 memmove(buf
->buffer
,buf
->currentPos
,offset
* sizeof(UChar
));
179 memset(pTarget
+offset
,0xff,sizeof(UChar
)*(MAX_IN_BUF
-offset
));
182 cbufSize
= MAX_IN_BUF
;
184 inputRead
=T_FileStream_read(buf
->in
,cbuf
,cbufSize
-offset
);
185 buf
->remaining
-=inputRead
;
188 cbufSize
= T_FileStream_size(buf
->in
);
189 cbuf
= (char*)uprv_malloc(cbufSize
);
190 inputRead
= T_FileStream_read(buf
->in
,cbuf
,cbufSize
);
191 buf
->remaining
-=inputRead
;
194 /* just to be sure...*/
195 if ( 0 == inputRead
)
199 /* convert the bytes */
201 /* set the callback to stop */
202 UConverterToUCallback toUOldAction
;
204 void* toUNewContext
=NULL
;
205 ucnv_setToUCallBack(buf
->conv
,
206 UCNV_TO_U_CALLBACK_STOP
,
209 (const void**)&toUOldContext
,
211 /* since state is saved in the converter we add offset to source*/
212 target
= pTarget
+offset
;
214 sourceLimit
= source
+ inputRead
;
215 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
216 &source
,sourceLimit
,NULL
,
217 (UBool
)(buf
->remaining
==0),error
);
219 if(U_FAILURE(*error
)){
220 char context
[CONTEXT_LEN
];
221 char preContext
[CONTEXT_LEN
];
222 char postContext
[CONTEXT_LEN
];
223 int8_t len
= CONTEXT_LEN
;
227 /* use error1 to preserve the error code */
228 UErrorCode error1
=U_ZERO_ERROR
;
230 if( buf
->showWarning
==TRUE
){
231 fprintf(stderr
,"\n###WARNING: Encountered abnormal bytes while"
232 " converting input stream to target encoding: %s\n",
233 u_errorName(*error
));
237 /* now get the context chars */
238 ucnv_getInvalidChars(buf
->conv
,context
,&len
,&error1
);
239 context
[len
]= 0 ; /* null terminate the buffer */
241 pos
= (int32_t)(source
- cbuf
- len
);
243 /* for pre-context */
244 start
= (pos
<=CONTEXT_LEN
)? 0 : (pos
- (CONTEXT_LEN
-1));
247 memcpy(preContext
,cbuf
+start
,stop
-start
);
248 /* null terminate the buffer */
249 preContext
[stop
-start
] = 0;
251 /* for post-context */
253 stop
= (int32_t)(((pos
+CONTEXT_LEN
)<= (sourceLimit
-cbuf
) )? (pos
+(CONTEXT_LEN
-1)) : (sourceLimit
-cbuf
));
255 memcpy(postContext
,source
,stop
-start
);
256 /* null terminate the buffer */
257 postContext
[stop
-start
] = 0;
259 if(buf
->showWarning
==TRUE
){
260 /* print out the context */
261 fprintf(stderr
,"\tPre-context: %s\n",preContext
);
262 fprintf(stderr
,"\tContext: %s\n",context
);
263 fprintf(stderr
,"\tPost-context: %s\n", postContext
);
266 /* reset the converter */
267 ucnv_reset(buf
->conv
);
269 /* set the call back to substitute
270 * and restart conversion
272 ucnv_setToUCallBack(buf
->conv
,
273 UCNV_TO_U_CALLBACK_SUBSTITUTE
,
276 (const void**)&toUOldContext
,
279 /* reset source and target start positions */
280 target
= pTarget
+offset
;
284 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
285 &source
,sourceLimit
,NULL
,
286 (UBool
)(buf
->remaining
==0),&error1
);
289 outputWritten
= (int32_t)(target
- pTarget
);
296 for(i
=0;i
<numRead
;i
++){
297 /* printf("%c", (char)(*target++));*/
303 u_charsToUChars(cbuf
,target
+offset
,inputRead
);
304 outputWritten
=((buf
->remaining
>cbufSize
)? cbufSize
:inputRead
+offset
);
306 buf
->currentPos
= pTarget
;
307 buf
->bufLimit
=pTarget
+outputWritten
;
308 *buf
->bufLimit
=0; /*NUL terminate*/
317 /*
 * Get the next UChar from the stream and advance buf->currentPos past it.
 *
 * A NULL or already-failed `error` is tested first. When the buffer is
 * exhausted (currentPos >= bufLimit) the code first tests buf->remaining == 0
 * and otherwise refills the buffer with ucbuf_fillucbuf(), checking *error
 * afterwards. The value returned on the normal path is *(buf->currentPos++).
 *
 * NOTE(review): the values returned from the early-exit branches are not
 * visible in this listing (presumably U_EOF) -- confirm.
 */
318 U_CAPI
int32_t U_EXPORT2
319 ucbuf_getc(UCHARBUF
* buf
,UErrorCode
* error
){
320 if(error
==NULL
|| U_FAILURE(*error
)){
323 if(buf
->currentPos
>=buf
->bufLimit
){
324 if(buf
->remaining
==0){
327 buf
=ucbuf_fillucbuf(buf
,error
);
328 if(U_FAILURE(*error
)){
333 return *(buf
->currentPos
++);
336 /*
 * Get the next UChar32 from the stream.
 *
 * retVal starts as U_EOF. When fewer than two UChars are left
 * (currentPos+1 >= bufLimit) the code first tests buf->remaining == 0 and
 * otherwise refills the buffer with ucbuf_fillucbuf(). If the UChar at
 * currentPos satisfies UTF_IS_LEAD(), it and the following UChar are
 * combined with UTF16_GET_PAIR_VALUE(); otherwise the single UChar is taken.
 * currentPos is advanced past every UChar that was used.
 *
 * NOTE(review): both arguments of UTF16_GET_PAIR_VALUE() increment
 * buf->currentPos inside one expression. C does not define the order in which
 * the two operands are evaluated, so which UChar ends up as lead and which as
 * trail is not guaranteed; reading the lead and trail in separate statements
 * would remove the ambiguity.
 * NOTE(review): the refill test uses currentPos+1, so it is also true when
 * exactly one UChar is left; check what the remaining == 0 branch (not
 * visible here) returns in that case.
 */
337 U_CAPI
int32_t U_EXPORT2
338 ucbuf_getc32(UCHARBUF
* buf
,UErrorCode
* error
){
339 int32_t retVal
= (int32_t)U_EOF
;
340 if(error
==NULL
|| U_FAILURE(*error
)){
343 if(buf
->currentPos
+1>=buf
->bufLimit
){
344 if(buf
->remaining
==0){
347 buf
=ucbuf_fillucbuf(buf
,error
);
348 if(U_FAILURE(*error
)){
352 if(UTF_IS_LEAD(*(buf
->currentPos
))){
353 retVal
=UTF16_GET_PAIR_VALUE(*(buf
->currentPos
++),*(buf
->currentPos
++));
355 retVal
= *(buf
->currentPos
++);
360 /*
 * Character-access callback handed to u_unescapeAt() by ucbuf_getcx32().
 * `context` is cast to UCHARBUF* and the UChar at index `offset`, counted
 * from that buffer's currentPos, is returned. No bounds check is done here.
 */
361 static UChar U_CALLCONV
362 _charAt(int32_t offset
, void *context
) {
363 return ((UCHARBUF
*) context
)->currentPos
[offset
];
366 /*
 * Get the next character from the stream and resolve a backslash escape
 * that starts at it.
 *
 * Steps visible in this listing:
 *  - a NULL or already-failed `error` is tested first;
 *  - the buffer is refilled with ucbuf_fillucbuf() when currentPos is within
 *    two UChars of bufLimit;
 *  - c1 is read from currentPos (which is then advanced) and c2 is the UChar
 *    that follows it;
 *  - the amount of buffered data is measured and, per the comment about the
 *    longest escape (\Uhhhhhhhh), the buffer is filled again when needed;
 *  - u_unescapeAt() is called with _charAt as its character accessor and the
 *    buffer itself as context;
 *  - on the failure branch "Bad escape: ..." is printed to stderr when
 *    buf->showWarning is set and *error becomes U_ILLEGAL_ESCAPE_SEQUENCE;
 *  - otherwise, when c32 differs from c2 (or in the special case for a
 *    backslash followed by u that unescapes to u), currentPos is advanced by
 *    the offset reported by u_unescapeAt().
 *
 * NOTE(review): the declarations, several conditions (including the test
 * for the backslash and the failure test on c32), all return statements and
 * the closing braces are not visible in this listing.
 */
367 U_CAPI
int32_t U_EXPORT2
368 ucbuf_getcx32(UCHARBUF
* buf
,UErrorCode
* error
) {
372 if(error
==NULL
|| U_FAILURE(*error
)){
375 /* Fill the buffer if it is empty */
376 if (buf
->currentPos
>=buf
->bufLimit
-2) {
377 ucbuf_fillucbuf(buf
,error
);
380 /* Get the next character in the buffer */
381 if (buf
->currentPos
< buf
->bufLimit
) {
382 c1
= *(buf
->currentPos
)++;
387 c2
= *(buf
->currentPos
);
389 /* If it isn't a backslash, return it */
394 /* Determine the amount of data in the buffer */
395 length
= (int32_t)(buf
->bufLimit
- buf
->currentPos
);
397 /* The longest escape sequence is \Uhhhhhhhh; make sure
398 we have at least that many characters */
401 /* fill the buffer */
402 ucbuf_fillucbuf(buf
,error
);
403 length
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
406 /* Process the escape */
408 c32
= u_unescapeAt(_charAt
, &offset
, length
, (void*)buf
);
410 /* check if u_unescapeAt unescaped and converted
414 if(buf
->showWarning
) {
420 context
[len
]= 0 ; /* null terminate the buffer */
421 u_UCharsToChars( buf
->currentPos
, context
, len
);
422 fprintf(stderr
,"Bad escape: [%c%s]...\n", (int)c1
, context
);
424 *error
= U_ILLEGAL_ESCAPE_SEQUENCE
;
426 }else if(c32
!=c2
|| (c32
==0x0075 && c2
==0x0075 && c1
==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
427 /* Update the current buffer position */
428 buf
->currentPos
+= offset
;
430 /* unescaping failed so we just return
431 * c1 and not consume the buffer
432 * this is useful for rules with escapes
/*
 * Open `fileName` (or standard input when the name is "-") and wrap it in a
 * newly allocated UCHARBUF.
 *
 * Steps visible in this listing:
 *  - a NULL or already-failed `error` is tested first; a NULL cp or fileName
 *    sets *error to U_ILLEGAL_ARGUMENT_ERROR;
 *  - the stream comes from T_FileStream_stdin() for "-" and from
 *    T_FileStream_open(fileName, "rb") otherwise;
 *  - the UCHARBUF is allocated with uprv_malloc(); *error is set to
 *    U_MEMORY_ALLOCATION_ERROR and the stream is closed on a path that is
 *    presumably the allocation failure (condition not visible);
 *  - showWarning and buffered are stored in the struct and signatureLength
 *    is reset to 0;
 *  - when *cp is NULL or empty, ucbuf_autodetect_fs() detects the charset and
 *    writes its name to *cp; when *cp names a charset accepted by
 *    ucbuf_isCPKnown(), ucbuf_autodetect_fs() is still run but its charset
 *    name goes to the local knownCp instead of *cp;
 *  - when no converter was opened that way and no error occurred,
 *    ucnv_open(*cp) opens one; on failure the converter and the stream are
 *    closed;
 *  - a warning is printed when there is still no converter and showWarning
 *    is TRUE;
 *  - remaining is set to the file size minus the signature length, and
 *    bufCapacity is either MAX_U_BUF or remaining + signatureLength + 1
 *    (presumably chosen by `buffered` -- condition not visible);
 *  - buf->buffer is allocated with room for bufCapacity UChars; on failure
 *    *error becomes U_MEMORY_ALLOCATION_ERROR and the converter and stream
 *    are closed;
 *  - currentPos and bufLimit start at the beginning of the buffer and
 *    ucbuf_fillucbuf() loads the first data;
 *  - *error is set to U_FILE_ACCESS_ERROR on a path that is presumably the
 *    failure to open the stream (condition not visible).
 *
 * NOTE(review): the declarations, several conditions, the assignments of
 * buf->in and the initial buf->conv, the return statements and the closing
 * braces are not visible in this listing.
 */
442 U_CAPI UCHARBUF
* U_EXPORT2
443 ucbuf_open(const char* fileName
,const char** cp
,UBool showWarning
, UBool buffered
, UErrorCode
* error
){
445 FileStream
* in
= NULL
;
448 if(error
==NULL
|| U_FAILURE(*error
)){
451 if(cp
==NULL
|| fileName
==NULL
){
452 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
455 if (!uprv_strcmp(fileName
, "-")) {
456 in
= T_FileStream_stdin();
458 in
= T_FileStream_open(fileName
, "rb");
462 UCHARBUF
* buf
=(UCHARBUF
*) uprv_malloc(sizeof(UCHARBUF
));
463 fileSize
= T_FileStream_size(in
);
465 *error
= U_MEMORY_ALLOCATION_ERROR
;
466 T_FileStream_close(in
);
471 buf
->showWarning
= showWarning
;
472 buf
->isBuffered
= buffered
;
473 buf
->signatureLength
=0;
474 if(*cp
==NULL
|| **cp
=='\0'){
475 /* don't have code page name... try to autodetect */
476 ucbuf_autodetect_fs(in
,cp
,&buf
->conv
,&buf
->signatureLength
,error
);
477 }else if(ucbuf_isCPKnown(*cp
)){
479 ucbuf_autodetect_fs(in
,&knownCp
,&buf
->conv
,&buf
->signatureLength
,error
);
481 if(U_SUCCESS(*error
) && buf
->conv
==NULL
) {
482 buf
->conv
=ucnv_open(*cp
,error
);
484 if(U_FAILURE(*error
)){
485 ucnv_close(buf
->conv
);
487 T_FileStream_close(in
);
491 if((buf
->conv
==NULL
) && (buf
->showWarning
==TRUE
)){
492 fprintf(stderr
,"###WARNING: No converter defined. Using codepage of system.\n");
494 buf
->remaining
=fileSize
-buf
->signatureLength
;
496 buf
->bufCapacity
=MAX_U_BUF
;
498 buf
->bufCapacity
=buf
->remaining
+buf
->signatureLength
+1/*for terminating nul*/;
500 buf
->buffer
=(UChar
*) uprv_malloc(U_SIZEOF_UCHAR
* buf
->bufCapacity
);
501 if (buf
->buffer
== NULL
) {
502 *error
= U_MEMORY_ALLOCATION_ERROR
;
503 ucnv_close(buf
->conv
);
505 T_FileStream_close(in
);
508 buf
->currentPos
=buf
->buffer
;
509 buf
->bufLimit
=buf
->buffer
;
510 if(U_FAILURE(*error
)){
511 fprintf(stderr
, "Could not open codepage [%s]: %s\n", *cp
, u_errorName(*error
));
512 ucnv_close(buf
->conv
);
514 T_FileStream_close(in
);
517 buf
=ucbuf_fillucbuf(buf
,error
);
520 *error
=U_FILE_ACCESS_ERROR
;
526 /* TODO: this method will fail if at the
527 * beginning of buffer and the uchar to unget
528 * is from the previous buffer. Need to implement
529 * system to take care of that situation.
/*
 * Push the character `c` back onto buf by moving buf->currentPos backwards.
 *
 * Visible logic: `c` is formatted as hexadecimal digits into the local
 * `escaped` with uprv_itou(), using a minimum width of either 8 or 4 digits
 * (the condition choosing between them is not visible; presumably it depends
 * on whether c is a supplementary code point). Provided currentPos is not at
 * the start of the buffer, the UChar just before currentPos is compared with
 * `c`, and failing that the `len` UChars before currentPos are compared with
 * the hexadecimal form, which covers a character that was read from an
 * escape sequence.
 *
 * NOTE(review): the statements that actually move currentPos are not visible
 * in this listing.
 * NOTE(review): currentPos-len is compared without a visible check that it
 * stays at or after buf->buffer -- confirm.
 */
531 U_CAPI
void U_EXPORT2
532 ucbuf_ungetc(int32_t c
,UCHARBUF
* buf
){
533 /* decrement currentPos pointer
534 * if not at the begining of buffer
536 UChar escaped
[8] ={'\0'};
539 len
= uprv_itou(escaped
,8,c
,16,8);
541 len
=uprv_itou(escaped
,8,c
,16,4);
543 if(buf
->currentPos
!=buf
->buffer
){
544 if(*(buf
->currentPos
-1)==c
){
546 }else if(u_strncmp(buf
->currentPos
-len
,escaped
,len
) == 0){
554 /*
 * Release the UChar buffer owned by buf: buf->buffer is passed to
 * uprv_free().
 * NOTE(review): the rest of the body is not visible in this listing.
 */
556 ucbuf_closebuf(UCHARBUF
* buf
){
557 uprv_free(buf
->buffer
);
561 /*
 * Close buf and release its resources. Visible here: the converter is closed
 * with ucnv_close() and the input stream with T_FileStream_close().
 * NOTE(review): the release of the UChar buffer and of the UCHARBUF itself,
 * and any NULL check on buf, are not visible in this listing -- confirm.
 */
562 U_CAPI
void U_EXPORT2
563 ucbuf_close(UCHARBUF
* buf
){
566 ucnv_close(buf
->conv
);
568 T_FileStream_close(buf
->in
);
574 /*
 * Rewind buf and its file stream so that reading starts over.
 *
 * Visible steps: a NULL or already-failed `error` is tested first;
 * currentPos and bufLimit are reset to the start of the buffer (so the buffer
 * holds no data); the stream is rewound; remaining is recomputed as the file
 * size minus the signature length; and the converter's to-Unicode state is
 * reset with ucnv_resetToUnicode(). When the file has a signature
 * (signatureLength > 0) the signature bytes are read again and run through
 * the converter, and *error is set to U_INTERNAL_PROGRAM_ERROR if the
 * conversion succeeded but the number of bytes read differs from
 * signatureLength or the result is not exactly one UChar equal to U+FEFF.
 *
 * NOTE(review): the local declarations, the body of the
 * U_BUFFER_OVERFLOW_ERROR branch and the closing braces are not visible in
 * this listing.
 */
575 U_CAPI
void U_EXPORT2
576 ucbuf_rewind(UCHARBUF
* buf
,UErrorCode
* error
){
577 if(error
==NULL
|| U_FAILURE(*error
)){
581 buf
->currentPos
=buf
->buffer
;
582 buf
->bufLimit
=buf
->buffer
;
583 T_FileStream_rewind(buf
->in
);
584 buf
->remaining
=T_FileStream_size(buf
->in
)-buf
->signatureLength
;
586 ucnv_resetToUnicode(buf
->conv
);
587 if(buf
->signatureLength
>0) {
588 UChar target
[1]={ 0 };
594 /* read the signature bytes */
595 numRead
=T_FileStream_read(buf
->in
, start
, buf
->signatureLength
);
597 /* convert and ignore initial U+FEFF, and the buffer overflow */
600 ucnv_toUnicode(buf
->conv
, &pTarget
, target
+1, &pStart
, start
+numRead
, NULL
, FALSE
, error
);
601 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
605 /* verify that we successfully read exactly U+FEFF */
606 if(U_SUCCESS(*error
) && (numRead
!=buf
->signatureLength
|| pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
607 *error
=U_INTERNAL_PROGRAM_ERROR
;
/*
 * Report the size of the data behind buf.
 *
 * Two return expressions are visible:
 *  - the file size minus the signature length, divided by
 *    ucnv_getMinCharSize(buf->conv);
 *  - the number of UChars currently held in the buffer (bufLimit - buffer).
 *
 * NOTE(review): the conditions that select between the two, and any other
 * return paths, are not visible in this listing (presumably the first is
 * used for buffered input with a converter) -- confirm.
 */
614 U_CAPI
int32_t U_EXPORT2
615 ucbuf_size(UCHARBUF
* buf
){
618 return (T_FileStream_size(buf
->in
)-buf
->signatureLength
)/ucnv_getMinCharSize(buf
->conv
);
620 return (int32_t)(buf
->bufLimit
- buf
->buffer
);
/*
 * Give the caller access to the UChars currently held by buf.
 *
 * A NULL or already-failed `error` is tested first; a NULL buf or len sets
 * *error to U_ILLEGAL_ARGUMENT_ERROR. Otherwise *len receives the number of
 * UChars between buf->buffer and buf->bufLimit.
 *
 * NOTE(review): the return statements are not visible in this listing; the
 * result is presumably buf->buffer on success and NULL on the error paths --
 * confirm.
 */
626 U_CAPI
const UChar
* U_EXPORT2
627 ucbuf_getBuffer(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* error
){
628 if(error
==NULL
|| U_FAILURE(*error
)){
631 if(buf
==NULL
|| len
==NULL
){
632 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
635 *len
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
/*
 * Build a path from `inputDir` and `fileName` in the caller's buffer
 * `target`, whose capacity is *len.
 *
 * Visible logic:
 *  - a NULL or already-failed `status` is tested first; a NULL inputDir,
 *    fileName or len, or a NULL target combined with *len > 0, sets *status
 *    to U_ILLEGAL_ARGUMENT_ERROR;
 *  - when inputDir does not end in U_FILE_SEP_CHAR, dirlen + filelen + 2
 *    bytes are required (directory, separator, file name, terminator);
 *    otherwise dirlen + filelen + 1. A smaller *len, or a NULL target, sets
 *    *status to U_BUFFER_OVERFLOW_ERROR;
 *  - in the first case inputDir followed by a separator is written to target
 *    only when fileName does not start with U_FILE_SEP_CHAR and inputDir does
 *    not end in '.'; in the second case inputDir is copied as is;
 *  - fileName is then appended with uprv_strcat().
 *
 * NOTE(review): inputDir[dirlen-1] is read with no visible guard against an
 * empty inputDir (dirlen == 0), which would index before the string --
 * confirm whether the missing lines handle that.
 * NOTE(review): the return statements, any update of *len and the closing
 * braces are not visible in this listing.
 */
639 U_CAPI
const char* U_EXPORT2
640 ucbuf_resolveFileName(const char* inputDir
, const char* fileName
, char* target
, int32_t* len
, UErrorCode
* status
){
641 int32_t requiredLen
= 0;
644 if(status
==NULL
|| U_FAILURE(*status
)){
648 if(inputDir
== NULL
|| fileName
== NULL
|| len
==NULL
|| (target
==NULL
&& *len
>0)){
649 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
654 dirlen
= (int32_t)uprv_strlen(inputDir
);
655 filelen
= (int32_t)uprv_strlen(fileName
);
656 if(inputDir
[dirlen
-1] != U_FILE_SEP_CHAR
) {
657 requiredLen
= dirlen
+ filelen
+ 2;
658 if((*len
< requiredLen
) || target
==NULL
){
660 *status
= U_BUFFER_OVERFLOW_ERROR
;
666 * append the input dir to openFileName if the first char in
667 * filename is not file seperation char and the last char input directory is not '.'.
668 * This is to support :
669 * genrb -s. /home/icu/data
671 * The user cannot mix notations like
672 * genrb -s. /icu/data --- the absolute path specified. -s redundant
674 * genrb -s. icu/data --- start from CWD and look in icu/data dir
676 if( (fileName
[0] != U_FILE_SEP_CHAR
) && (inputDir
[dirlen
-1] !='.')){
677 uprv_strcpy(target
, inputDir
);
678 target
[dirlen
] = U_FILE_SEP_CHAR
;
680 target
[dirlen
+ 1] = '\0';
682 requiredLen
= dirlen
+ filelen
+ 1;
683 if((*len
< requiredLen
) || target
==NULL
){
685 *status
= U_BUFFER_OVERFLOW_ERROR
;
689 uprv_strcpy(target
, inputDir
);
692 uprv_strcat(target
, fileName
);
696 * Unicode TR 13 says any of the below chars is
697 * a new line char in a readline function in addition
698 * to CR+LF combination which needs to be
/*
 * Classify `c` as a line separator. The case labels visible below are
 * LF (U+000A), CR (U+000D), FF (U+000C), NEL (U+0085), LS (U+2028) and
 * PS (U+2029).
 * NOTE(review): the switch header, the return statements and the default
 * branch are not visible in this listing; presumably TRUE is returned for
 * the listed characters and FALSE otherwise -- confirm.
 */
701 static UBool
ucbuf_isCharNewLine(UChar c
){
703 case 0x000A: /* LF */
704 case 0x000D: /* CR */
705 case 0x000C: /* FF */
706 case 0x0085: /* NEL */
707 case 0x2028: /* LS */
708 case 0x2029: /* PS */
/*
 * Return a pointer to the next line held by buf and store its length in
 * *len.
 *
 * Two code paths are visible: one for buffered input and one for the case
 * in which, per the comment below, all input is already in the internal
 * buffer. Both scan forward from buf->currentPos with `temp`, treat CR
 * followed by LF as a single separator, and otherwise stop at bufLimit or at
 * a character accepted by ucbuf_isCharNewLine(). At that point *len is
 * computed from temp - buf->currentPos, the old currentPos is saved in
 * savePos and currentPos is moved to temp.
 *
 * NULL is returned at end of input: when buf->remaining == 0 in the buffered
 * path, and when currentPos == bufLimit in the other path. In the buffered
 * path *err is set to U_TRUNCATED_CHAR_FOUND when the scan has reached
 * bufLimit while currentPos is still at the start of the buffer, and the
 * buffer is refilled with ucbuf_fillucbuf().
 *
 * NOTE(review): the test temp+1<=buf->bufLimit lets *(temp+1) read the
 * element at bufLimit itself; ucbuf_fillucbuf() stores a NUL there, so the
 * read stays inside the allocation only as long as that terminator slot
 * exists -- confirm.
 * NOTE(review): the scanning loops, the condition choosing between the two
 * paths, the statements that read `c`, most return statements and the
 * closing braces are not visible in this listing.
 */
715 U_CAPI
const UChar
* U_EXPORT2
716 ucbuf_readline(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* err
){
717 UChar
* temp
= buf
->currentPos
;
718 UChar
* savePos
=NULL
;
721 /* The input is buffered we have to do more
722 * for returning a pointer U_TRUNCATED_CHAR_FOUND
726 if(buf
->remaining
==0){
727 return NULL
; /* end of file is reached return NULL */
729 if(temp
>=buf
->bufLimit
&& buf
->currentPos
== buf
->buffer
){
730 *err
= U_TRUNCATED_CHAR_FOUND
;
733 ucbuf_fillucbuf(buf
,err
);
739 * Accoding to TR 13 readLine functions must interpret
740 * CR, CR+LF, LF, NEL, PS, LS or FF as line seperators
743 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
744 *len
= (int32_t)(temp
++ - buf
->currentPos
);
745 savePos
= buf
->currentPos
;
746 buf
->currentPos
= temp
;
751 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)){ /* Unipad inserts 2028 line separators! */
752 *len
= (int32_t)(temp
- buf
->currentPos
);
753 savePos
= buf
->currentPos
;
754 buf
->currentPos
= temp
;
759 /* we know that all input is read into the internal
760 * buffer so we can safely return pointers
765 if(buf
->currentPos
==buf
->bufLimit
){
766 return NULL
; /* end of file is reached return NULL */
769 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
770 *len
= (int32_t)(temp
++ - buf
->currentPos
);
771 savePos
= buf
->currentPos
;
772 buf
->currentPos
= temp
;
776 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)) { /* Unipad inserts 2028 line separators! */
777 *len
= (int32_t)(temp
- buf
->currentPos
);
778 savePos
= buf
->currentPos
;
779 buf
->currentPos
= temp
;
785 /* A compiler warning will appear if all paths don't contain a return statement. */