2 *******************************************************************************
4 * Copyright (C) 1998-2008, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
11 * Modification History:
13 * Date Name Description
14 * 05/10/01 Ram Creation.
15 *******************************************************************************
18 #include "unicode/utypes.h"
19 #include "unicode/putil.h"
20 #include "unicode/ucnv.h"
21 #include "unicode/ucnv_err.h"
26 #include "unicode/ustring.h"
27 #include "unicode/uchar.h"
31 #if !UCONFIG_NO_CONVERSION
/* Size in bytes of the stack byte buffer `carr` declared in ucbuf_fillucbuf
 * and the value assigned to cbufSize on one of its two read paths. */
34 #define MAX_IN_BUF 1000
/* Value assigned to buf->bufCapacity (counted in UChars) on one path of
 * ucbuf_open. NOTE(review): the guarding condition is not visible in this
 * listing - presumably the buffered mode; confirm. */
35 #define MAX_U_BUF 1500
/* Capacity, excluding the terminator, of the context strings that are
 * printed to stderr when a conversion or an escape sequence fails. */
36 #define CONTEXT_LEN 20
/* Fragment of the UCHARBUF member list. NOTE(review): the struct header and
 * the remaining members are on lines missing from this listing. */
/* Number of bytes taken by the Unicode signature at the start of the file;
 * filled in through ucbuf_autodetect_fs and subtracted from the file size
 * when buf->remaining is computed. */
44 int32_t signatureLength
;
/* When TRUE, conversion and escape problems are reported on stderr. */
47 UBool showWarning
; /* makes this API not produce any errors */
/*
 * ucbuf_autodetect_fs: detect a Unicode signature (byte order mark) at the
 * start of an already-open file stream and open the matching converter.
 *
 * Steps visible in this listing:
 *  - read up to sizeof(start) bytes from `in` and hand them to
 *    ucnv_detectUnicodeSignature(), whose result is stored in *cp and which
 *    receives signatureLength as its output argument;
 *  - rewind the stream and, when the signature length is positive, re-read
 *    exactly *signatureLength bytes so only the signature stays consumed;
 *  - open a converter for *cp and run the signature bytes through
 *    ucnv_toUnicode() into a one-UChar target;
 *  - set *signatureLength to the number of bytes the converter consumed
 *    (pStart - start);
 *  - special-case U_BUFFER_OVERFLOW_ERROR (the existing comment says it is
 *    ignored; the branch body is not visible here);
 *  - set U_INTERNAL_PROGRAM_ERROR unless the conversion produced exactly
 *    one UChar equal to U+FEFF.
 *
 * Parameters: in - stream to inspect; cp - receives the detected charset
 * name; conv - receives the opened converter; signatureLength - receives
 * the signature length in bytes; error - ICU error code (in/out).
 *
 * NOTE(review): the embedded numbering has gaps (53-55, 57-59, 69-75,
 * 80-81, 85-87, 91-95). The declarations of start, numRead, pTarget and
 * pStart, any handling of a NULL detection result, and the UBool return
 * statements are not visible - confirm against the upstream file.
 */
51 U_CAPI UBool U_EXPORT2
52 ucbuf_autodetect_fs(FileStream
* in
, const char** cp
, UConverter
** conv
, int32_t* signatureLength
, UErrorCode
* error
){
56 UChar target
[1]={ 0 };
60 /* read a few bytes */
61 numRead
=T_FileStream_read(in
, start
, sizeof(start
));
63 *cp
= ucnv_detectUnicodeSignature(start
, numRead
, signatureLength
, error
);
65 /* unread the bytes beyond what was consumed for U+FEFF */
66 T_FileStream_rewind(in
);
67 if (*signatureLength
> 0) {
68 numRead
= T_FileStream_read(in
, start
, *signatureLength
);
76 /* open the converter for the detected Unicode charset */
77 *conv
= ucnv_open(*cp
,error
);
79 /* convert and ignore initial U+FEFF, and the buffer overflow */
82 ucnv_toUnicode(*conv
, &pTarget
, target
+1, &pStart
, start
+*signatureLength
, NULL
, FALSE
, error
);
83 *signatureLength
= (int32_t)(pStart
- start
);
84 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
88 /* verify that we successfully read exactly U+FEFF */
89 if(U_SUCCESS(*error
) && (pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
90 *error
=U_INTERNAL_PROGRAM_ERROR
;
96 static UBool
ucbuf_isCPKnown(const char* cp
){
97 if(ucnv_compareNames("UTF-8",cp
)==0){
100 if(ucnv_compareNames("UTF-16BE",cp
)==0){
103 if(ucnv_compareNames("UTF-16LE",cp
)==0){
106 if(ucnv_compareNames("UTF-16",cp
)==0){
109 if(ucnv_compareNames("UTF-32",cp
)==0){
112 if(ucnv_compareNames("UTF-32BE",cp
)==0){
115 if(ucnv_compareNames("UTF-32LE",cp
)==0){
118 if(ucnv_compareNames("SCSU",cp
)==0){
121 if(ucnv_compareNames("BOCU-1",cp
)==0){
124 if(ucnv_compareNames("UTF-7",cp
)==0){
/*
 * ucbuf_autodetect: open `fileName` in binary mode ("rb") and run
 * ucbuf_autodetect_fs() on the resulting stream.
 *
 * Visible logic: a guard on `error` being NULL or already a failure; a
 * NULL check on conv, cp and fileName that sets U_ILLEGAL_ARGUMENT_ERROR;
 * the open call; an assignment of U_FILE_ACCESS_ERROR (its condition is
 * hidden - presumably a failed open); the detection call; and a
 * T_FileStream_close(in) on a path after the detection call.
 *
 * NOTE(review): signatureLength is not part of the NULL check although
 * ucbuf_autodetect_fs() dereferences it.
 * NOTE(review): the declaration of `in` and every return statement are on
 * lines missing from this listing, so which paths hand the stream back to
 * the caller cannot be told from here.
 */
130 U_CAPI FileStream
* U_EXPORT2
131 ucbuf_autodetect(const char* fileName
, const char** cp
,UConverter
** conv
, int32_t* signatureLength
,UErrorCode
* error
){
133 if(error
==NULL
|| U_FAILURE(*error
)){
136 if(conv
==NULL
|| cp
==NULL
|| fileName
==NULL
){
137 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
141 in
= T_FileStream_open(fileName
,"rb");
144 *error
=U_FILE_ACCESS_ERROR
;
148 if(ucbuf_autodetect_fs(in
,cp
,conv
,signatureLength
,error
)) {
153 T_FileStream_close(in
);
/*
 * ucbuf_fillucbuf: read more bytes from buf->in and refill buf->buffer with
 * UChars.
 *
 * Steps visible in this listing:
 *  - if unread UChars remain (currentPos < bufLimit), `offset` is set to
 *    their count and they are moved to the front of the buffer;
 *  - two read paths exist: one sets cbufSize to MAX_IN_BUF and reads
 *    cbufSize-offset bytes, the other sizes cbuf to the whole file with
 *    uprv_malloc() and reads everything; both subtract the bytes read from
 *    buf->remaining (the conditions selecting a path are hidden);
 *  - with a converter: the to-Unicode callback is set to
 *    UCNV_TO_U_CALLBACK_STOP and the bytes are converted to
 *    buffer+offset, flushing when buf->remaining is 0;
 *  - on a conversion failure: an optional warning is printed, the invalid
 *    bytes plus pre-/post-context are collected and optionally printed,
 *    the converter is reset, the callback is switched to
 *    UCNV_TO_U_CALLBACK_SUBSTITUTE and the conversion is run again with a
 *    separate error code (error1) so that *error keeps the first failure;
 *  - another path copies bytes with u_charsToUChars() instead of converting;
 *  - finally currentPos is set to the buffer start, bufLimit to
 *    start+outputWritten, and a NUL is stored at *bufLimit.
 *
 * NOTE(review): the conversion limit is buffer+bufCapacity while the NUL is
 * stored at *bufLimit; if a fill ever produced bufCapacity UChars the
 * terminator would land one element past the allocation made in
 * ucbuf_open - confirm the capacity always leaves a spare slot.
 * NOTE(review): many lines are missing from this listing (return type,
 * several declarations, the assignments of `stop` and of the post-context
 * `start`, branch conditions, frees and the return statement); the summary
 * above covers only what is visible.
 */
158 /* fill the uchar buffer */
160 ucbuf_fillucbuf( UCHARBUF
* buf
,UErrorCode
* error
){
163 const char* source
=NULL
;
164 char carr
[MAX_IN_BUF
] = {'\0'};
167 int32_t outputWritten
=0;
169 const char* sourceLimit
=NULL
;
171 pTarget
= buf
->buffer
;
172 /* check if we arrived here without exhausting the buffer*/
173 if(buf
->currentPos
<buf
->bufLimit
){
174 offset
= (int32_t)(buf
->bufLimit
-buf
->currentPos
);
175 memmove(buf
->buffer
,buf
->currentPos
,offset
* sizeof(UChar
));
179 memset(pTarget
+offset
,0xff,sizeof(UChar
)*(MAX_IN_BUF
-offset
));
182 cbufSize
= MAX_IN_BUF
;
184 inputRead
=T_FileStream_read(buf
->in
,cbuf
,cbufSize
-offset
);
185 buf
->remaining
-=inputRead
;
188 cbufSize
= T_FileStream_size(buf
->in
);
189 cbuf
= (char*)uprv_malloc(cbufSize
);
191 *error
= U_MEMORY_ALLOCATION_ERROR
;
194 inputRead
= T_FileStream_read(buf
->in
,cbuf
,cbufSize
);
195 buf
->remaining
-=inputRead
;
198 /* just to be sure...*/
199 if ( 0 == inputRead
)
203 /* convert the bytes */
205 /* set the callback to stop */
206 UConverterToUCallback toUOldAction
;
208 void* toUNewContext
=NULL
;
209 ucnv_setToUCallBack(buf
->conv
,
210 UCNV_TO_U_CALLBACK_STOP
,
213 (const void**)&toUOldContext
,
215 /* since state is saved in the converter we add offset to source*/
216 target
= pTarget
+offset
;
218 sourceLimit
= source
+ inputRead
;
219 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
220 &source
,sourceLimit
,NULL
,
221 (UBool
)(buf
->remaining
==0),error
);
223 if(U_FAILURE(*error
)){
224 char context
[CONTEXT_LEN
+1];
225 char preContext
[CONTEXT_LEN
+1];
226 char postContext
[CONTEXT_LEN
+1];
227 int8_t len
= CONTEXT_LEN
;
231 /* use error1 to preserve the error code */
232 UErrorCode error1
=U_ZERO_ERROR
;
234 if( buf
->showWarning
==TRUE
){
235 fprintf(stderr
,"\n###WARNING: Encountered abnormal bytes while"
236 " converting input stream to target encoding: %s\n",
237 u_errorName(*error
));
241 /* now get the context chars */
242 ucnv_getInvalidChars(buf
->conv
,context
,&len
,&error1
);
243 context
[len
]= 0 ; /* null terminate the buffer */
245 pos
= (int32_t)(source
- cbuf
- len
);
247 /* for pre-context */
248 start
= (pos
<=CONTEXT_LEN
)? 0 : (pos
- (CONTEXT_LEN
-1));
251 memcpy(preContext
,cbuf
+start
,stop
-start
);
252 /* null terminate the buffer */
253 preContext
[stop
-start
] = 0;
255 /* for post-context */
257 stop
= (int32_t)(((pos
+CONTEXT_LEN
)<= (sourceLimit
-cbuf
) )? (pos
+(CONTEXT_LEN
-1)) : (sourceLimit
-cbuf
));
259 memcpy(postContext
,source
,stop
-start
);
260 /* null terminate the buffer */
261 postContext
[stop
-start
] = 0;
263 if(buf
->showWarning
==TRUE
){
264 /* print out the context */
265 fprintf(stderr
,"\tPre-context: %s\n",preContext
);
266 fprintf(stderr
,"\tContext: %s\n",context
);
267 fprintf(stderr
,"\tPost-context: %s\n", postContext
);
270 /* reset the converter */
271 ucnv_reset(buf
->conv
);
273 /* set the call back to substitute
274 * and restart conversion
276 ucnv_setToUCallBack(buf
->conv
,
277 UCNV_TO_U_CALLBACK_SUBSTITUTE
,
280 (const void**)&toUOldContext
,
283 /* reset source and target start positions */
284 target
= pTarget
+offset
;
288 ucnv_toUnicode(buf
->conv
,&target
,target
+(buf
->bufCapacity
-offset
),
289 &source
,sourceLimit
,NULL
,
290 (UBool
)(buf
->remaining
==0),&error1
);
293 outputWritten
= (int32_t)(target
- pTarget
);
300 for(i
=0;i
<numRead
;i
++){
301 /* printf("%c", (char)(*target++));*/
307 u_charsToUChars(cbuf
,target
+offset
,inputRead
);
308 outputWritten
=((buf
->remaining
>cbufSize
)? cbufSize
:inputRead
+offset
);
310 buf
->currentPos
= pTarget
;
311 buf
->bufLimit
=pTarget
+outputWritten
;
312 *buf
->bufLimit
=0; /*NUL terminate*/
/*
 * ucbuf_getc: return the next UTF-16 code unit held in the buffer.
 *
 * Visible logic: a guard on `error` being NULL or already a failure; when
 * currentPos has reached bufLimit, a test for buf->remaining==0 and
 * otherwise a refill through ucbuf_fillucbuf() followed by an error check;
 * finally *currentPos is returned and currentPos advances by one.
 *
 * NOTE(review): the values returned on the guard, the remaining==0 path and
 * the failure path are on lines missing from this listing - presumably
 * U_EOF; confirm.
 */
321 /* get a UChar from the stream*/
322 U_CAPI
int32_t U_EXPORT2
323 ucbuf_getc(UCHARBUF
* buf
,UErrorCode
* error
){
324 if(error
==NULL
|| U_FAILURE(*error
)){
327 if(buf
->currentPos
>=buf
->bufLimit
){
328 if(buf
->remaining
==0){
331 buf
=ucbuf_fillucbuf(buf
,error
);
332 if(U_FAILURE(*error
)){
337 return *(buf
->currentPos
++);
/*
 * ucbuf_getc32: return the next code point, combining a surrogate pair
 * into one value.
 *
 * Visible logic: retVal starts as U_EOF; a guard on `error`; when fewer
 * than two UChars are left (currentPos+1 >= bufLimit), a test for
 * buf->remaining==0 and otherwise a refill through ucbuf_fillucbuf() with
 * an error check; if *currentPos is a lead surrogate, retVal is built with
 * UTF16_GET_PAIR_VALUE from currentPos[0] and currentPos[1]; the other
 * visible assignment takes *currentPos and advances by one.
 *
 * NOTE(review): the outer test also fires when exactly one UChar is still
 * unread. If the hidden body of the remaining==0 branch returns end of
 * file, the final code unit of the input would never be delivered - verify
 * against the upstream file.
 * NOTE(review): the advance past a surrogate pair and all return
 * statements are on lines missing from this listing.
 */
340 /* get a UChar32 from the stream*/
341 U_CAPI
int32_t U_EXPORT2
342 ucbuf_getc32(UCHARBUF
* buf
,UErrorCode
* error
){
343 int32_t retVal
= (int32_t)U_EOF
;
344 if(error
==NULL
|| U_FAILURE(*error
)){
347 if(buf
->currentPos
+1>=buf
->bufLimit
){
348 if(buf
->remaining
==0){
351 buf
=ucbuf_fillucbuf(buf
,error
);
352 if(U_FAILURE(*error
)){
356 if(UTF_IS_LEAD(*(buf
->currentPos
))){
357 retVal
=UTF16_GET_PAIR_VALUE(buf
->currentPos
[0],buf
->currentPos
[1]);
360 retVal
= *(buf
->currentPos
++);
365 /* u_unescapeAt() callback to return a UChar*/
366 static UChar U_CALLCONV
367 _charAt(int32_t offset
, void *context
) {
368 return ((UCHARBUF
*) context
)->currentPos
[offset
];
/*
 * ucbuf_getcx32: read the next character and, when it begins a backslash
 * escape, return the unescaped code point instead.
 *
 * Visible logic:
 *  - a guard on `error` being NULL or already a failure;
 *  - a refill through ucbuf_fillucbuf() when currentPos >= bufLimit-2;
 *  - c1 is read from *currentPos (advancing by one) when currentPos is
 *    below bufLimit, and c2 is peeked from the new *currentPos;
 *  - `length` is the number of UChars left in the buffer; per the existing
 *    comment the buffer is refilled when that is too short for the longest
 *    escape, after which length is recomputed as bufLimit - buffer;
 *  - the escape is decoded with u_unescapeAt(), using _charAt as accessor
 *    and `buf` as its context;
 *  - on the failure path, "Bad escape" plus up to CONTEXT_LEN characters of
 *    context is printed when showWarning is set, and
 *    U_ILLEGAL_ESCAPE_SEQUENCE is stored in *error;
 *  - otherwise, if c32 differs from c2 (or all of c32==0x0075, c2==0x0075
 *    and c1==0x005C hold), currentPos advances by `offset`;
 *  - otherwise, per the existing comment, c1 is returned and nothing more
 *    is consumed.
 *
 * NOTE(review): the declarations, the backslash test, the failure
 * condition and every return statement are on lines missing from this
 * listing.
 */
371 /* getc and escape it */
372 U_CAPI
int32_t U_EXPORT2
373 ucbuf_getcx32(UCHARBUF
* buf
,UErrorCode
* error
) {
377 if(error
==NULL
|| U_FAILURE(*error
)){
380 /* Fill the buffer if it is empty */
381 if (buf
->currentPos
>=buf
->bufLimit
-2) {
382 ucbuf_fillucbuf(buf
,error
);
385 /* Get the next character in the buffer */
386 if (buf
->currentPos
< buf
->bufLimit
) {
387 c1
= *(buf
->currentPos
)++;
392 c2
= *(buf
->currentPos
);
394 /* If it isn't a backslash, return it */
399 /* Determine the amount of data in the buffer */
400 length
= (int32_t)(buf
->bufLimit
- buf
->currentPos
);
402 /* The longest escape sequence is \Uhhhhhhhh; make sure
403 we have at least that many characters */
406 /* fill the buffer */
407 ucbuf_fillucbuf(buf
,error
);
408 length
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
411 /* Process the escape */
413 c32
= u_unescapeAt(_charAt
, &offset
, length
, (void*)buf
);
415 /* check if u_unescapeAt unescaped and converted
419 if(buf
->showWarning
) {
420 char context
[CONTEXT_LEN
+1];
421 int32_t len
= CONTEXT_LEN
;
425 context
[len
]= 0 ; /* null terminate the buffer */
426 u_UCharsToChars( buf
->currentPos
, context
, len
);
427 fprintf(stderr
,"Bad escape: [%c%s]...\n", (int)c1
, context
);
429 *error
= U_ILLEGAL_ESCAPE_SEQUENCE
;
431 }else if(c32
!=c2
|| (c32
==0x0075 && c2
==0x0075 && c1
==0x005C) /* for \u0075 c2=0x0075 and c32==0x0075*/){
432 /* Update the current buffer position */
433 buf
->currentPos
+= offset
;
435 /* unescaping failed so we just return
436 * c1 and not consume the buffer
437 * this is useful for rules with escapes
/*
 * ucbuf_open: create a UCHARBUF over `fileName` and perform the first fill.
 *
 * Steps visible in this listing:
 *  - a guard on `error`, and a NULL check on cp and fileName that sets
 *    U_ILLEGAL_ARGUMENT_ERROR;
 *  - a file name of "-" selects T_FileStream_stdin(); anything else is
 *    opened with mode "rb";
 *  - the UCHARBUF is allocated with uprv_malloc() and the file size is
 *    read; a failure path sets U_MEMORY_ALLOCATION_ERROR and closes `in`;
 *  - showWarning and isBuffered are copied from the arguments and
 *    signatureLength starts at 0;
 *  - when *cp is NULL or empty, ucbuf_autodetect_fs() chooses the code
 *    page; when *cp passes ucbuf_isCPKnown(), detection still runs but its
 *    name goes to the local knownCp rather than to *cp;
 *  - if no converter was opened and no error occurred, ucnv_open(*cp) is
 *    used; on failure the converter and the stream are closed;
 *  - a warning is printed when there is no converter and showWarning is
 *    TRUE;
 *  - remaining = fileSize - signatureLength; bufCapacity is either
 *    MAX_U_BUF or remaining + signatureLength + 1 (selector hidden);
 *  - the UChar buffer is allocated (U_MEMORY_ALLOCATION_ERROR on failure),
 *    currentPos and bufLimit are set to its start, and ucbuf_fillucbuf()
 *    is called;
 *  - U_FILE_ACCESS_ERROR is assigned near the end (condition hidden -
 *    presumably a failed open).
 *
 * NOTE(review): the "Could not open codepage" message directly follows its
 * U_FAILURE test, so in the visible lines it is not gated on showWarning,
 * unlike the other diagnostics in this file.
 * NOTE(review): the assignment of buf->in, the release of `buf` on the
 * failure paths and every return statement are on lines missing from this
 * listing.
 */
447 U_CAPI UCHARBUF
* U_EXPORT2
448 ucbuf_open(const char* fileName
,const char** cp
,UBool showWarning
, UBool buffered
, UErrorCode
* error
){
450 FileStream
* in
= NULL
;
453 if(error
==NULL
|| U_FAILURE(*error
)){
456 if(cp
==NULL
|| fileName
==NULL
){
457 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
460 if (!uprv_strcmp(fileName
, "-")) {
461 in
= T_FileStream_stdin();
463 in
= T_FileStream_open(fileName
, "rb");
467 UCHARBUF
* buf
=(UCHARBUF
*) uprv_malloc(sizeof(UCHARBUF
));
468 fileSize
= T_FileStream_size(in
);
470 *error
= U_MEMORY_ALLOCATION_ERROR
;
471 T_FileStream_close(in
);
476 buf
->showWarning
= showWarning
;
477 buf
->isBuffered
= buffered
;
478 buf
->signatureLength
=0;
479 if(*cp
==NULL
|| **cp
=='\0'){
480 /* don't have code page name... try to autodetect */
481 ucbuf_autodetect_fs(in
,cp
,&buf
->conv
,&buf
->signatureLength
,error
);
482 }else if(ucbuf_isCPKnown(*cp
)){
484 ucbuf_autodetect_fs(in
,&knownCp
,&buf
->conv
,&buf
->signatureLength
,error
);
486 if(U_SUCCESS(*error
) && buf
->conv
==NULL
) {
487 buf
->conv
=ucnv_open(*cp
,error
);
489 if(U_FAILURE(*error
)){
490 ucnv_close(buf
->conv
);
492 T_FileStream_close(in
);
496 if((buf
->conv
==NULL
) && (buf
->showWarning
==TRUE
)){
497 fprintf(stderr
,"###WARNING: No converter defined. Using codepage of system.\n");
499 buf
->remaining
=fileSize
-buf
->signatureLength
;
501 buf
->bufCapacity
=MAX_U_BUF
;
503 buf
->bufCapacity
=buf
->remaining
+buf
->signatureLength
+1/*for terminating nul*/;
505 buf
->buffer
=(UChar
*) uprv_malloc(U_SIZEOF_UCHAR
* buf
->bufCapacity
);
506 if (buf
->buffer
== NULL
) {
507 *error
= U_MEMORY_ALLOCATION_ERROR
;
511 buf
->currentPos
=buf
->buffer
;
512 buf
->bufLimit
=buf
->buffer
;
513 if(U_FAILURE(*error
)){
514 fprintf(stderr
, "Could not open codepage [%s]: %s\n", *cp
, u_errorName(*error
));
518 ucbuf_fillucbuf(buf
,error
);
519 if(U_FAILURE(*error
)){
525 *error
=U_FILE_ACCESS_ERROR
;
/*
 * ucbuf_ungetc: push back the UChar `c` by stepping the read position back.
 *
 * Visible logic: when currentPos is not at the start of the buffer, the
 * UChar just before currentPos is compared with `c`; the existing comments
 * name two silent failure cases - the value did not match, or the read
 * position is at the beginning of the buffer. The function returns void,
 * so callers get no indication of either failure.
 *
 * NOTE(review): the decrement itself and the closing braces are on lines
 * missing from this listing.
 */
531 /* TODO: this method will fail if at the
532 * beginning of buffer and the uchar to unget
533 * is from the previous buffer. Need to implement
534 * system to take care of that situation.
536 U_CAPI
void U_EXPORT2
537 ucbuf_ungetc(int32_t c
,UCHARBUF
* buf
){
538 /* decrement currentPos pointer
539 * if not at the beginning of buffer
541 if(buf
->currentPos
!=buf
->buffer
){
542 if(*(buf
->currentPos
-1)==c
){
545 /* ungetc failed - did not match. */
548 /* ungetc failed - beginning of buffer. */
/*
 * ucbuf_closebuf: release the UChar storage owned by `buf` by passing
 * buf->buffer to uprv_free().
 * NOTE(review): the return-type line and anything after the free (for
 * example clearing the pointer) are on lines missing from this listing.
 */
552 /* frees the resources of UChar* buffer */
554 ucbuf_closebuf(UCHARBUF
* buf
){
555 uprv_free(buf
->buffer
);
/*
 * ucbuf_close: tear down a UCHARBUF. The visible statements close the
 * converter with ucnv_close(buf->conv) and the stream with
 * T_FileStream_close(buf->in).
 * NOTE(review): any NULL checks on `buf`, the release of the UChar buffer
 * and of the struct itself are on lines missing from this listing.
 */
559 /* close the buf and release resources*/
560 U_CAPI
void U_EXPORT2
561 ucbuf_close(UCHARBUF
* buf
){
564 ucnv_close(buf
->conv
);
566 T_FileStream_close(buf
->in
);
/*
 * ucbuf_rewind: reset the buffer and the underlying stream to the start of
 * the text, skipping the Unicode signature again when the file has one.
 *
 * Visible logic:
 *  - a guard on `error` being NULL or already a failure;
 *  - currentPos and bufLimit are both reset to the start of the buffer,
 *    the stream is rewound, and remaining is recomputed as the file size
 *    minus signatureLength;
 *  - the converter's to-Unicode state is reset;
 *  - when signatureLength > 0, exactly that many bytes are read and pushed
 *    through ucnv_toUnicode() into a one-UChar target;
 *    U_BUFFER_OVERFLOW_ERROR is special-cased (branch body hidden), and
 *    U_INTERNAL_PROGRAM_ERROR is set unless the read returned
 *    signatureLength bytes and produced exactly one UChar, U+FEFF.
 *
 * NOTE(review): the declarations of start, numRead, pTarget and pStart and
 * some branch bodies are on lines missing from this listing.
 */
572 /* rewind the buf and file stream */
573 U_CAPI
void U_EXPORT2
574 ucbuf_rewind(UCHARBUF
* buf
,UErrorCode
* error
){
575 if(error
==NULL
|| U_FAILURE(*error
)){
579 buf
->currentPos
=buf
->buffer
;
580 buf
->bufLimit
=buf
->buffer
;
581 T_FileStream_rewind(buf
->in
);
582 buf
->remaining
=T_FileStream_size(buf
->in
)-buf
->signatureLength
;
584 ucnv_resetToUnicode(buf
->conv
);
585 if(buf
->signatureLength
>0) {
586 UChar target
[1]={ 0 };
592 /* read the signature bytes */
593 numRead
=T_FileStream_read(buf
->in
, start
, buf
->signatureLength
);
595 /* convert and ignore initial U+FEFF, and the buffer overflow */
598 ucnv_toUnicode(buf
->conv
, &pTarget
, target
+1, &pStart
, start
+numRead
, NULL
, FALSE
, error
);
599 if(*error
==U_BUFFER_OVERFLOW_ERROR
) {
603 /* verify that we successfully read exactly U+FEFF */
604 if(U_SUCCESS(*error
) && (numRead
!=buf
->signatureLength
|| pTarget
!=(target
+1) || target
[0]!=0xfeff)) {
605 *error
=U_INTERNAL_PROGRAM_ERROR
;
/*
 * ucbuf_size: report a size for the buffered text.
 *
 * Two return statements are visible: one divides the file size minus the
 * signature length by ucnv_getMinCharSize(buf->conv), the other returns
 * the number of UChars currently between buffer and bufLimit.
 *
 * NOTE(review): the conditions selecting between the two returns are on
 * lines missing from this listing. The first value is computed from byte
 * counts, so treat it as an estimate rather than an exact UChar count -
 * confirm how callers use it.
 */
612 U_CAPI
int32_t U_EXPORT2
613 ucbuf_size(UCHARBUF
* buf
){
616 return (T_FileStream_size(buf
->in
)-buf
->signatureLength
)/ucnv_getMinCharSize(buf
->conv
);
618 return (int32_t)(buf
->bufLimit
- buf
->buffer
);
/*
 * ucbuf_getBuffer: expose the internal UChar buffer and its length.
 *
 * Visible logic: a guard on `error` being NULL or already a failure; a
 * NULL check on buf and len that sets U_ILLEGAL_ARGUMENT_ERROR; *len is
 * set to the number of UChars between buffer and bufLimit.
 *
 * NOTE(review): the return statements are on lines missing from this
 * listing - presumably NULL on the early exits and buf->buffer otherwise;
 * confirm.
 */
624 U_CAPI
const UChar
* U_EXPORT2
625 ucbuf_getBuffer(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* error
){
626 if(error
==NULL
|| U_FAILURE(*error
)){
629 if(buf
==NULL
|| len
==NULL
){
630 *error
= U_ILLEGAL_ARGUMENT_ERROR
;
633 *len
= (int32_t)(buf
->bufLimit
- buf
->buffer
);
/*
 * ucbuf_resolveFileName: build "<inputDir><separator><fileName>" in the
 * caller-supplied `target`.
 *
 * Visible logic:
 *  - a guard on `status` being NULL or already a failure;
 *  - U_ILLEGAL_ARGUMENT_ERROR when inputDir, fileName or len is NULL, or
 *    when target is NULL while *len > 0;
 *  - if inputDir does not end in U_FILE_SEP_CHAR, the required length is
 *    dirlen + filelen + 2; the directory and a separator are copied only
 *    when fileName does not start with the separator and inputDir does not
 *    end in '.';
 *  - otherwise the required length is dirlen + filelen + 1 and inputDir is
 *    copied as is;
 *  - in both cases U_BUFFER_OVERFLOW_ERROR is set when *len is smaller
 *    than the required length or target is NULL;
 *  - fileName is appended with uprv_strcat().
 *
 * NOTE(review): inputDir[dirlen-1] is evaluated without checking that
 * dirlen > 0, so an empty inputDir reads one byte before the string.
 * NOTE(review): target[dirlen + 1] is terminated on the visible path even
 * when the directory was not copied - check what target holds before the
 * final uprv_strcat() in that case.
 * NOTE(review): the declarations of dirlen and filelen, any update of *len
 * on overflow and every return statement are on lines missing from this
 * listing.
 */
637 U_CAPI
const char* U_EXPORT2
638 ucbuf_resolveFileName(const char* inputDir
, const char* fileName
, char* target
, int32_t* len
, UErrorCode
* status
){
639 int32_t requiredLen
= 0;
642 if(status
==NULL
|| U_FAILURE(*status
)){
646 if(inputDir
== NULL
|| fileName
== NULL
|| len
==NULL
|| (target
==NULL
&& *len
>0)){
647 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
652 dirlen
= (int32_t)uprv_strlen(inputDir
);
653 filelen
= (int32_t)uprv_strlen(fileName
);
654 if(inputDir
[dirlen
-1] != U_FILE_SEP_CHAR
) {
655 requiredLen
= dirlen
+ filelen
+ 2;
656 if((*len
< requiredLen
) || target
==NULL
){
658 *status
= U_BUFFER_OVERFLOW_ERROR
;
664 * append the input dir to openFileName if the first char in
665 * filename is not file seperation char and the last char input directory is not '.'.
666 * This is to support :
667 * genrb -s. /home/icu/data
669 * The user cannot mix notations like
670 * genrb -s. /icu/data --- the absolute path specified. -s redundant
672 * genrb -s. icu/data --- start from CWD and look in icu/data dir
674 if( (fileName
[0] != U_FILE_SEP_CHAR
) && (inputDir
[dirlen
-1] !='.')){
675 uprv_strcpy(target
, inputDir
);
676 target
[dirlen
] = U_FILE_SEP_CHAR
;
678 target
[dirlen
+ 1] = '\0';
680 requiredLen
= dirlen
+ filelen
+ 1;
681 if((*len
< requiredLen
) || target
==NULL
){
683 *status
= U_BUFFER_OVERFLOW_ERROR
;
687 uprv_strcpy(target
, inputDir
);
690 uprv_strcat(target
, fileName
);
694 * Unicode TR 13 says any of the below chars is
695 * a new line char in a readline function in addition
696 * to CR+LF combination which needs to be
699 static UBool
ucbuf_isCharNewLine(UChar c
){
701 case 0x000A: /* LF */
702 case 0x000D: /* CR */
703 case 0x000C: /* FF */
704 case 0x0085: /* NEL */
705 case 0x2028: /* LS */
706 case 0x2029: /* PS */
/*
 * ucbuf_readline: return a pointer to the next line inside the internal
 * buffer and store its length in *len.
 *
 * Two similar scanning sections are visible, introduced by the comments
 * "The input is buffered..." and "we know that all input is read into the
 * internal buffer...":
 *  - the first returns NULL when buf->remaining is 0, sets
 *    U_TRUNCATED_CHAR_FOUND when the scan reaches bufLimit while currentPos
 *    is still at the buffer start (a line longer than the buffer), and
 *    otherwise refills through ucbuf_fillucbuf();
 *  - the second returns NULL when currentPos equals bufLimit.
 * In both, a CR (0x0d) followed by LF (0x0a) is tested first; otherwise the
 * line ends at bufLimit or at any character accepted by
 * ucbuf_isCharNewLine(). On a match *len is set to the distance from
 * currentPos to the scan position, the old currentPos is saved in savePos,
 * and currentPos moves to the scan position.
 *
 * NOTE(review): the declaration and assignment of `c`, the loop headers,
 * the test that selects a section and the `return savePos` statements are
 * on lines missing from this listing. Whether *len includes the line
 * terminator depends on where the hidden code advances `temp` - confirm.
 */
713 U_CAPI
const UChar
* U_EXPORT2
714 ucbuf_readline(UCHARBUF
* buf
,int32_t* len
,UErrorCode
* err
){
715 UChar
* temp
= buf
->currentPos
;
716 UChar
* savePos
=NULL
;
719 /* The input is buffered we have to do more
720 * for returning a pointer U_TRUNCATED_CHAR_FOUND
724 if(buf
->remaining
==0){
725 return NULL
; /* end of file is reached return NULL */
727 if(temp
>=buf
->bufLimit
&& buf
->currentPos
== buf
->buffer
){
728 *err
= U_TRUNCATED_CHAR_FOUND
;
731 ucbuf_fillucbuf(buf
,err
);
737 * Accoding to TR 13 readLine functions must interpret
738 * CR, CR+LF, LF, NEL, PS, LS or FF as line seperators
741 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
742 *len
= (int32_t)(temp
++ - buf
->currentPos
);
743 savePos
= buf
->currentPos
;
744 buf
->currentPos
= temp
;
749 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)){ /* Unipad inserts 2028 line separators! */
750 *len
= (int32_t)(temp
- buf
->currentPos
);
751 savePos
= buf
->currentPos
;
752 buf
->currentPos
= temp
;
757 /* we know that all input is read into the internal
758 * buffer so we can safely return pointers
763 if(buf
->currentPos
==buf
->bufLimit
){
764 return NULL
; /* end of file is reached return NULL */
767 if(c
==0x0d && temp
+1<=buf
->bufLimit
&& *(temp
+1) == 0x0a ){
768 *len
= (int32_t)(temp
++ - buf
->currentPos
);
769 savePos
= buf
->currentPos
;
770 buf
->currentPos
= temp
;
774 if (temp
>=buf
->bufLimit
|| ucbuf_isCharNewLine(c
)) { /* Unipad inserts 2028 line separators! */
775 *len
= (int32_t)(temp
- buf
->currentPos
);
776 savePos
= buf
->currentPos
;
777 buf
->currentPos
= temp
;
783 /* A compiler warning will appear if all paths don't contain a return statement. */