2 *******************************************************************************
4 * Copyright (C) 1998-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 05/10/01 Ram Creation.
15 *******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/putil.h"
20 #include "unicode/ucnv.h"
21 #include "unicode/ucnv_err.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
31 #define MAX_IN_BUF 1000
32 #define MAX_U_BUF 1500
33 #define CONTEXT_LEN 15
41 int32_t signatureLength
;
44 UBool showWarning
; /* makes this API not produce any errors */
48 U_CAPI UBool U_EXPORT2
49 ucbuf_autodetect_fs(FileStream
* in
, const char** cp
, UConverter
** conv
, int32_t* signatureLength
, UErrorCode
* error
){
53 UChar target
[1]={ 0 };
57 /* read a few bytes */
58 numRead
=T_FileStream_read(in
, start
, sizeof(start
));
60 *cp
= ucnv_detectUnicodeSignature(start
, numRead
, signatureLength
, error
);
62 /* unread the bytes beyond what was consumed for U+FEFF */
63 T_FileStream_rewind(in
);
64 if (*signatureLength
> 0) {
65 numRead
= T_FileStream_read(in
, start
, *signatureLength
);
73 /* open the converter for the detected Unicode charset */
74 *conv
= ucnv_open(*cp
,error
);
76 /* convert and ignore initial U+FEFF, and the buffer overflow */
79 ucnv_toUnicode(*conv
, &pTarget
, target
+1, &pStart
, start
+*signatureLength
, NULL
, FALSE
, error
);
80 *signatureLength
= (int32_t)(pStart
- start
);
81 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
85 /* verify that we successfully read exactly U+FEFF */
86 if(U_SUCCESS(*error
) && (pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
87 *error
=U_INTERNAL_PROGRAM_ERROR
;
93 static UBool
ucbuf_isCPKnown(const char* cp
){
94 if(ucnv_compareNames("UTF-8",cp
)==0){
97 if(ucnv_compareNames("UTF-16BE",cp
)==0){
100 if(ucnv_compareNames("UTF-16LE",cp
)==0){
103 if(ucnv_compareNames("UTF-16",cp
)==0){
106 if(ucnv_compareNames("UTF-32",cp
)==0){
109 if(ucnv_compareNames("UTF-32BE",cp
)==0){
112 if(ucnv_compareNames("UTF-32LE",cp
)==0){
115 if(ucnv_compareNames("SCSU",cp
)==0){
118 if(ucnv_compareNames("BOCU-1",cp
)==0){
121 if(ucnv_compareNames("UTF-7",cp
)==0){
127 U_CAPI FileStream
* U_EXPORT2
128 ucbuf_autodetect(const char* fileName
, const char** cp
,UConverter
** conv
, int32_t* signatureLength
,UErrorCode
* error
){
130 if(error
==NULL
|| U_FAILURE(*error
)){
133 if(conv
==NULL
|| cp
==NULL
|| fileName
==NULL
){
134 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
138 in
= T_FileStream_open(fileName
,"rb");
141 *error
=U_FILE_ACCESS_ERROR
;
145 if(ucbuf_autodetect_fs(in
,cp
,conv
,signatureLength
,error
)) {
150 T_FileStream_close(in
);
155 /* fill the uchar buffer */
157 ucbuf_fillucbuf( UCHARBUF
* buf
,UErrorCode
* error
){
160 const char* source
=NULL
;
161 char carr
[MAX_IN_BUF
] = {'\0'};
164 int32_t outputWritten
=0;
166 const char* sourceLimit
=NULL
;
168 pTarget
= buf
->buffer
;
169 /* check if we arrived here without exhausting the buffer*/
170 if(buf
->currentPos
<buf
->bufLimit
){
171 offset
= (int32_t)(buf
->bufLimit
-buf
->currentPos
);
172 memmove(buf
->buffer
,buf
->currentPos
,offset
* sizeof(UChar
));
176 memset(pTarget
+offset
,0xff,sizeof(UChar
)*(MAX_IN_BUF
-offset
));
179 cbufSize
= MAX_IN_BUF
;
181 inputRead
=T_FileStream_read(buf
->in
,cbuf
,cbufSize
-offset
);
182 buf
->remaining
-=inputRead
;
185 cbufSize
= T_FileStream_size(buf
->in
);
186 cbuf
= (char*)uprv_malloc(cbufSize
);
187 inputRead
= T_FileStream_read(buf
->in
,cbuf
,cbufSize
);
188 buf
->remaining
-=inputRead
;
191 /* just to be sure...*/
192 if ( 0 == inputRead
)
196 /* convert the bytes */
198 /* set the callback to stop */
199 UConverterToUCallback toUOldAction
;
201 void* toUNewContext
=NULL
;
202 ucnv_setToUCallBack(buf
->conv
,
203 UCNV_TO_U_CALLBACK_STOP
,
206 (const void**)&toUOldContext
,
208 /* since state is saved in the converter we add offset to source*/
209 target
= pTarget
+offset
;
211 sourceLimit
= source
+ inputRead
;
212 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
213 &source
,sourceLimit
,NULL
,
214 (UBool
)(buf
->remaining
==0),error
);
216 if(U_FAILURE(*error
)){
217 char context
[CONTEXT_LEN
];
218 char preContext
[CONTEXT_LEN
];
219 char postContext
[CONTEXT_LEN
];
220 int8_t len
= CONTEXT_LEN
;
224 /* use erro1 to preserve the error code */
225 UErrorCode error1
=U_ZERO_ERROR
;
227 if( buf
->showWarning
==TRUE
){
228 fprintf(stderr
,"\n###WARNING: Encountered abnormal bytes while"
229 " converting input stream to target encoding: %s\n",
230 u_errorName(*error
));
234 /* now get the context chars */
235 ucnv_getInvalidChars(buf
->conv
,context
,&len
,&error1
);
236 context
[len
]= 0 ; /* null terminate the buffer */
238 pos
= (int32_t)(source
- cbuf
- len
);
240 /* for pre-context */
241 start
= (pos
<=CONTEXT_LEN
)? 0 : (pos
- (CONTEXT_LEN
-1));
244 memcpy(preContext
,cbuf
+start
,stop
-start
);
245 /* null terminate the buffer */
246 preContext
[stop
-start
] = 0;
248 /* for post-context */
250 stop
= (int32_t)(((pos
+CONTEXT_LEN
)<= (sourceLimit
-cbuf
) )? (pos
+(CONTEXT_LEN
-1)) : (sourceLimit
-cbuf
));
252 memcpy(postContext
,source
,stop
-start
);
253 /* null terminate the buffer */
254 postContext
[stop
-start
] = 0;
256 if(buf
->showWarning
==TRUE
){
257 /* print out the context */
258 fprintf(stderr
,"\tPre-context: %s\n",preContext
);
259 fprintf(stderr
,"\tContext: %s\n",context
);
260 fprintf(stderr
,"\tPost-context: %s\n", postContext
);
263 /* reset the converter */
264 ucnv_reset(buf
->conv
);
266 /* set the call back to substitute
267 * and restart conversion
269 ucnv_setToUCallBack(buf
->conv
,
270 UCNV_TO_U_CALLBACK_SUBSTITUTE
,
273 (const void**)&toUOldContext
,
276 /* reset source and target start positions */
277 target
= pTarget
+offset
;
281 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
282 &source
,sourceLimit
,NULL
,
283 (UBool
)(buf
->remaining
==0),&error1
);
286 outputWritten
= (int32_t)(target
- pTarget
);
293 for(i
=0;i
<numRead
;i
++){
294 /* printf("%c", (char)(*target++));*/
300 u_charsToUChars(cbuf
,target
+offset
,inputRead
);
301 outputWritten
=((buf
->remaining
>cbufSize
)? cbufSize
:inputRead
+offset
);
303 buf
->currentPos
= pTarget
;
304 buf
->bufLimit
=pTarget
+outputWritten
;
305 *buf
->bufLimit
=0; /*NUL terminate*/
314 /* get a UChar from the stream*/
315 U_CAPI
int32_t U_EXPORT2
316 ucbuf_getc(UCHARBUF
* buf
,UErrorCode
* error
){
317 if(error
==NULL
|| U_FAILURE(*error
)){
320 if(buf
->currentPos
>=buf
->bufLimit
){
321 if(buf
->remaining
==0){
324 buf
=ucbuf_fillucbuf(buf
,error
);
325 if(U_FAILURE(*error
)){
330 return *(buf
->currentPos
++);
333 /* get a UChar32 from the stream*/
334 U_CAPI
int32_t U_EXPORT2
335 ucbuf_getc32(UCHARBUF
* buf
,UErrorCode
* error
){
336 int32_t retVal
= (int32_t)U_EOF
;
337 if(error
==NULL
|| U_FAILURE(*error
)){
340 if(buf
->currentPos
+1>=buf
->bufLimit
){
341 if(buf
->remaining
==0){
344 buf
=ucbuf_fillucbuf(buf
,error
);
345 if(U_FAILURE(*error
)){
349 if(UTF_IS_LEAD(*(buf
->currentPos
))){
350 retVal
=UTF16_GET_PAIR_VALUE(*(buf
->currentPos
++),*(buf
->currentPos
++));
352 retVal
= *(buf
->currentPos
++);
357 /* u_unescapeAt() callback to return a UChar*/
358 static UChar U_CALLCONV
359 _charAt(int32_t offset
, void *context
) {
360 return ((UCHARBUF
*) context
)->currentPos
[offset
];
363 /* getc and escape it */
364 U_CAPI
int32_t U_EXPORT2
365 ucbuf_getcx32(UCHARBUF
* buf
,UErrorCode
* error
) {
369 if(error
==NULL
|| U_FAILURE(*error
)){
372 /* Fill the buffer if it is empty */
373 if (buf
->currentPos
>=buf
->bufLimit
-2) {
374 ucbuf_fillucbuf(buf
,error
);
377 /* Get the next character in the buffer */
378 if (buf
->currentPos
< buf
->bufLimit
) {
379 c1
= *(buf
->currentPos
)++;
384 c2
= *(buf
->currentPos
);
386 /* If it isn't a backslash, return it */
391 /* Determine the amount of data in the buffer */
392 length
= (int32_t)(buf
->bufLimit
- buf
->currentPos
);
394 /* The longest escape sequence is \Uhhhhhhhh; make sure
395 we have at least that many characters */
398 /* fill the buffer */
399 ucbuf_fillucbuf(buf
,error
);
400 length
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
403 /* Process the escape */
405 c32
= u_unescapeAt(_charAt
, &offset
, length
, (void*)buf
);
407 /* check if u_unescapeAt unescaped and converted
411 if(buf
->showWarning
) {
417 context
[len
]= 0 ; /* null terminate the buffer */
418 u_UCharsToChars( buf
->currentPos
, context
, len
);
419 fprintf(stderr
,"Bad escape: [%c%s]...\n", (int)c1
, context
);
421 *error
= U_ILLEGAL_ESCAPE_SEQUENCE
;
423 }else if(c32
!=c2
|| (c32
==0x0075 && c2
==0x0075 && c1
==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
424 /* Update the current buffer position */
425 buf
->currentPos
+= offset
;
427 /* unescaping failed so we just return
428 * c1 and not consume the buffer
429 * this is useful for rules with escapes
439 U_CAPI UCHARBUF
* U_EXPORT2
440 ucbuf_open(const char* fileName
,const char** cp
,UBool showWarning
, UBool buffered
, UErrorCode
* error
){
442 FileStream
* in
= NULL
;
445 if(error
==NULL
|| U_FAILURE(*error
)){
448 if(cp
==NULL
|| fileName
==NULL
){
449 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
452 if (!uprv_strcmp(fileName
, "-")) {
453 in
= T_FileStream_stdin();
455 in
= T_FileStream_open(fileName
, "rb");
459 UCHARBUF
* buf
=(UCHARBUF
*) uprv_malloc(sizeof(UCHARBUF
));
460 fileSize
= T_FileStream_size(in
);
464 buf
->showWarning
= showWarning
;
465 buf
->isBuffered
= buffered
;
466 buf
->signatureLength
=0;
467 if(*cp
==NULL
|| **cp
=='\0'){
468 /* don't have code page name... try to autodetect */
469 ucbuf_autodetect_fs(in
,cp
,&buf
->conv
,&buf
->signatureLength
,error
);
470 }else if(ucbuf_isCPKnown(*cp
)){
472 ucbuf_autodetect_fs(in
,&knownCp
,&buf
->conv
,&buf
->signatureLength
,error
);
474 if(U_SUCCESS(*error
) && buf
->conv
==NULL
) {
475 buf
->conv
=ucnv_open(*cp
,error
);
477 if(U_FAILURE(*error
)){
478 ucnv_close(buf
->conv
);
483 if((buf
->conv
==NULL
) && (buf
->showWarning
==TRUE
)){
484 fprintf(stderr
,"###WARNING: No converter defined. Using codepage of system.\n");
486 buf
->remaining
=fileSize
-buf
->signatureLength
;
488 buf
->bufCapacity
=MAX_U_BUF
;
490 buf
->bufCapacity
=buf
->remaining
+buf
->signatureLength
+1/*for terminating nul*/;
492 buf
->buffer
=(UChar
*) uprv_malloc(U_SIZEOF_UCHAR
* buf
->bufCapacity
);
493 if (buf
->buffer
== NULL
) {
494 *error
= U_MEMORY_ALLOCATION_ERROR
;
497 buf
->currentPos
=buf
->buffer
;
498 buf
->bufLimit
=buf
->buffer
;
499 if(U_FAILURE(*error
)){
500 fprintf(stderr
, "Could not open codepage [%s]: %s\n", *cp
, u_errorName(*error
));
503 buf
=ucbuf_fillucbuf(buf
,error
);
506 *error
= U_MEMORY_ALLOCATION_ERROR
;
511 *error
=U_FILE_ACCESS_ERROR
;
517 /* TODO: this method will fail if at the
518 * begining of buffer and the uchar to unget
519 * is from the previous buffer. Need to implement
520 * system to take care of that situation.
522 U_CAPI
void U_EXPORT2
523 ucbuf_ungetc(int32_t c
,UCHARBUF
* buf
){
524 /* decrement currentPos pointer
525 * if not at the begining of buffer
527 UChar escaped
[8] ={'\0'};
530 len
= uprv_itou(escaped
,8,c
,16,8);
532 len
=uprv_itou(escaped
,8,c
,16,4);
534 if(buf
->currentPos
!=buf
->buffer
){
535 if(*(buf
->currentPos
-1)==c
){
537 }else if(u_strncmp(buf
->currentPos
-len
,escaped
,len
) == 0){
545 /* frees the resources of UChar* buffer */
547 ucbuf_closebuf(UCHARBUF
* buf
){
548 uprv_free(buf
->buffer
);
552 /* close the buf and release resources*/
553 U_CAPI
void U_EXPORT2
554 ucbuf_close(UCHARBUF
* buf
){
557 ucnv_close(buf
->conv
);
559 T_FileStream_close(buf
->in
);
565 /* rewind the buf and file stream */
566 U_CAPI
void U_EXPORT2
567 ucbuf_rewind(UCHARBUF
* buf
,UErrorCode
* error
){
568 if(error
==NULL
|| U_FAILURE(*error
)){
572 buf
->currentPos
=buf
->buffer
;
573 buf
->bufLimit
=buf
->buffer
;
574 T_FileStream_rewind(buf
->in
);
575 buf
->remaining
=T_FileStream_size(buf
->in
)-buf
->signatureLength
;
577 ucnv_resetToUnicode(buf
->conv
);
578 if(buf
->signatureLength
>0) {
579 UChar target
[1]={ 0 };
585 /* read the signature bytes */
586 numRead
=T_FileStream_read(buf
->in
, start
, buf
->signatureLength
);
588 /* convert and ignore initial U+FEFF, and the buffer overflow */
591 ucnv_toUnicode(buf
->conv
, &pTarget
, target
+1, &pStart
, start
+numRead
, NULL
, FALSE
, error
);
592 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
596 /* verify that we successfully read exactly U+FEFF */
597 if(U_SUCCESS(*error
) && (numRead
!=buf
->signatureLength
|| pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
598 *error
=U_INTERNAL_PROGRAM_ERROR
;
605 U_CAPI
int32_t U_EXPORT2
606 ucbuf_size(UCHARBUF
* buf
){
609 return (T_FileStream_size(buf
->in
)-buf
->signatureLength
)/ucnv_getMinCharSize(buf
->conv
);
611 return (int32_t)(buf
->bufLimit
- buf
->buffer
);
617 U_CAPI
const UChar
* U_EXPORT2
618 ucbuf_getBuffer(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* error
){
619 if(error
==NULL
|| U_FAILURE(*error
)){
622 if(buf
==NULL
|| len
==NULL
){
623 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
626 *len
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
630 U_CAPI
const char* U_EXPORT2
631 ucbuf_resolveFileName(const char* inputDir
, const char* fileName
, char* target
, int32_t* len
, UErrorCode
* status
){
632 int32_t requiredLen
= 0;
635 if(status
==NULL
|| U_FAILURE(*status
)){
639 if(inputDir
== NULL
|| fileName
== NULL
|| len
==NULL
|| (target
==NULL
&& *len
>0)){
640 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
645 dirlen
= (int32_t)uprv_strlen(inputDir
);
646 filelen
= (int32_t)uprv_strlen(fileName
);
647 if(inputDir
[dirlen
-1] != U_FILE_SEP_CHAR
) {
648 requiredLen
= dirlen
+ filelen
+ 2;
649 if((*len
< requiredLen
) || target
==NULL
){
651 *status
= U_BUFFER_OVERFLOW_ERROR
;
657 * append the input dir to openFileName if the first char in
658 * filename is not file seperation char and the last char input directory is not '.'.
659 * This is to support :
660 * genrb -s. /home/icu/data
662 * The user cannot mix notations like
663 * genrb -s. /icu/data --- the absolute path specified. -s redundant
665 * genrb -s. icu/data --- start from CWD and look in icu/data dir
667 if( (fileName
[0] != U_FILE_SEP_CHAR
) && (inputDir
[dirlen
-1] !='.')){
668 uprv_strcpy(target
, inputDir
);
669 target
[dirlen
] = U_FILE_SEP_CHAR
;
671 target
[dirlen
+ 1] = '\0';
673 requiredLen
= dirlen
+ filelen
+ 1;
674 if((*len
< requiredLen
) || target
==NULL
){
676 *status
= U_BUFFER_OVERFLOW_ERROR
;
680 uprv_strcpy(target
, inputDir
);
683 uprv_strcat(target
, fileName
);
687 * Unicode TR 13 says any of the below chars is
688 * a new line char in a readline function in addition
689 * to CR+LF combination which needs to be
692 static UBool
ucbuf_isCharNewLine(UChar c
){
694 case 0x000A: /* LF */
695 case 0x000D: /* CR */
696 case 0x000C: /* FF */
697 case 0x0085: /* NEL */
698 case 0x2028: /* LS */
699 case 0x2029: /* PS */
706 U_CAPI
const UChar
* U_EXPORT2
707 ucbuf_readline(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* err
){
708 UChar
* temp
= buf
->currentPos
;
709 UChar
* savePos
=NULL
;
712 /* The input is buffered we have to do more
713 * for returning a pointer U_TRUNCATED_CHAR_FOUND
717 if(buf
->remaining
==0){
718 return NULL
; /* end of file is reached return NULL */
720 if(temp
>=buf
->bufLimit
&& buf
->currentPos
== buf
->buffer
){
721 *err
= U_TRUNCATED_CHAR_FOUND
;
724 ucbuf_fillucbuf(buf
,err
);
730 * Accoding to TR 13 readLine functions must interpret
731 * CR, CR+LF, LF, NEL, PS, LS or FF as line seperators
734 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
735 *len
= (int32_t)(temp
++ - buf
->currentPos
);
736 savePos
= buf
->currentPos
;
737 buf
->currentPos
= temp
;
742 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)){ /* Unipad inserts 2028 line separators! */
743 *len
= (int32_t)(temp
- buf
->currentPos
);
744 savePos
= buf
->currentPos
;
745 buf
->currentPos
= temp
;
750 /* we know that all input is read into the internal
751 * buffer so we can safely return pointers
756 if(buf
->currentPos
==buf
->bufLimit
){
757 return NULL
; /* end of file is reached return NULL */
760 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
761 *len
= (int32_t)(temp
++ - buf
->currentPos
);
762 savePos
= buf
->currentPos
;
763 buf
->currentPos
= temp
;
767 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)) { /* Unipad inserts 2028 line separators! */
768 *len
= (int32_t)(temp
- buf
->currentPos
);
769 savePos
= buf
->currentPos
;
770 buf
->currentPos
= temp
;
776 /* A compiler warning will appear if all paths don't contain a return statement. */