2 *******************************************************************************
4 * Copyright (C) 2003-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
10 * tab size: 8 (not used)
13 * created on: 2003feb1
14 * created by: Ram Viswanadha
17 #include "unicode/utypes.h"
21 #include "unicode/uidna.h"
22 #include "unicode/ustring.h"
23 #include "unicode/usprep.h"
29 /* The official IDNA ACE prefix is "xn--", spelled out as UChar code units. */
30 static const UChar ACE_PREFIX
[] ={ 0x0078,0x006E,0x002d,0x002d } ;
/* Number of code units in ACE_PREFIX. */
31 #define ACE_PREFIX_LENGTH 4
/* Longest converted label allowed before U_IDNA_LABEL_TOO_LONG_ERROR is reported. */
33 #define MAX_LABEL_LENGTH 63
35 /* Capacities (in UChars) of the on-stack work buffers. They are larger than a
   valid label needs; when a result still does not fit, the functions below
   allocate a heap buffer and redo the step. */
36 #define MAX_LABEL_BUFFER_SIZE 100
37 #define MAX_IDN_BUFFER_SIZE 300
/* Bounds and offset used for ASCII-only case folding. */
39 #define CAPITAL_A 0x0041
40 #define CAPITAL_Z 0x005A
41 #define LOWER_CASE_DELTA 0x0020
/* Separator written between labels in the converted output. */
42 #define FULL_STOP 0x002E
/* Name handed to usprep_open() when loading the StringPrep profile. */
43 #define DATA_FILE_NAME "uidna"
46 toASCIILower(UChar ch
){
47 if(CAPITAL_A
<= ch
&& ch
<= CAPITAL_Z
){
48 return ch
+ LOWER_CASE_DELTA
;
54 startsWithPrefix(const UChar
* src
, int32_t srcLength
){
55 UBool startsWithPrefix
= TRUE
;
57 if(srcLength
< ACE_PREFIX_LENGTH
){
61 for(int8_t i
=0; i
< ACE_PREFIX_LENGTH
; i
++){
62 if(toASCIILower(src
[i
]) != ACE_PREFIX
[i
]){
63 startsWithPrefix
= FALSE
;
66 return startsWithPrefix
;
70 toASCIILower(UChar
* src
, int32_t srcLen
){
71 for(int32_t i
=0; i
<srcLen
; i
++){
72 src
[i
] = toASCIILower(src
[i
]);
/*
 * Compares two UChar strings while ignoring ASCII case: each pair of code
 * units is passed through toASCIILower() and the difference of the folded
 * values becomes the comparison result.
 *
 * s1/s1Len and s2/s2Len are the two strings and their lengths.
 * NOTE(review): the statements that handle unequal lengths and that leave the
 * loop are not present in this listing; confirm the exact return values
 * against the complete file.
 */
77 compareCaseInsensitiveASCII(const UChar
* s1
, int32_t s1Len
,
78 const UChar
* s2
, int32_t s2Len
){
83 // are the two strings of different lengths?
93 // the lengths are equal
// the loop header has no condition; it is left from inside the body
101 for(int32_t i
=0;/* no condition */;i
++) {
103 /* If we reach the ends of both strings then they match */
111 /* Case-insensitive comparison: subtract the ASCII-lowercased code units */
113 rc
=(int32_t)toASCIILower(c1
)-(int32_t)toASCIILower(c2
);
125 * Ascertain if the given code point is a label separator as
126 * defined by the IDNA RFC
128 * @param ch The code point to be ascertained
129 * @return true if the char is a label separator
132 static inline UBool
isLabelSeparator(UChar ch
){
144 // Returns the length of the next label, excluding its separator.
145 // On return *limit points just past the separator when one was found,
// otherwise at the end of the input (the terminating NUL, or src+srcLength).
// NOTE(review): the remaining parameters and the return statements are not
// present in this listing; callers pass (&delimiter, &done) as the 3rd and
// 4th arguments.
147 static inline int32_t
148 getNextSeparator(UChar
*src
,int32_t srcLength
,
155 *limit
= src
+ i
; // point to null
159 if(isLabelSeparator(src
[i
])){
160 *limit
= src
+ (i
+1); // go past the delimiter
// explicit-length input: scan at most srcLength code units
167 for(i
=0;i
<srcLength
;i
++){
168 if(isLabelSeparator(src
[i
])){
169 *limit
= src
+ (i
+1); // go past the delimiter
173 // no delimiter was found: the label runs to the end of the input
175 *limit
= src
+srcLength
;
181 static inline UBool
isLDHChar(UChar ch
){
186 //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
188 (0x0030 <= ch
&& ch
<= 0x0039) ||
189 (0x0041 <= ch
&& ch
<= 0x005A) ||
190 (0x0061 <= ch
&& ch
<= 0x007A)
/*
 * Converts a single label to its ASCII form.
 *
 * Visible flow:
 *  - src is run through usprep_prepare() with the given nameprep profile into
 *    b1 (stack buffer first, heap buffer if U_BUFFER_OVERFLOW_ERROR is set);
 *  - the prepared text is scanned with isLDHChar();
 *  - when UIDNA_USE_STD3_RULES is set, non-LDH characters or a leading or
 *    trailing hyphen produce U_IDNA_STD3_ASCII_RULES_ERROR and parseError is
 *    filled in through uprv_syntaxError();
 *  - the result is either b1 copied to dest, or ACE_PREFIX followed by the
 *    u_strToPunycode() encoding of b1; a b1 that already starts with the
 *    prefix yields U_IDNA_ACE_PREFIX_ERROR;
 *  - a required length above MAX_LABEL_LENGTH sets U_IDNA_LABEL_TOO_LONG_ERROR.
 *
 * Returns u_terminateUChars(dest, destCapacity, reqLength, status).
 * UIDNA_ALLOW_UNASSIGNED in options is forwarded to usprep_prepare() as
 * USPREP_ALLOW_UNASSIGNED.
 */
198 _internal_toASCII(const UChar
* src
, int32_t srcLength
,
199 UChar
* dest
, int32_t destCapacity
,
201 UStringPrepProfile
* nameprep
,
202 UParseError
* parseError
,
205 UChar b1Stack
[MAX_LABEL_BUFFER_SIZE
], b2Stack
[MAX_LABEL_BUFFER_SIZE
];
206 // b1 and b2 start out on the stack buffers and move to the heap only on overflow
207 UChar
*b1
= b1Stack
, *b2
= b2Stack
;
208 int32_t b1Len
, b2Len
,
209 b1Capacity
= MAX_LABEL_BUFFER_SIZE
,
210 b2Capacity
= MAX_LABEL_BUFFER_SIZE
,
213 int32_t namePrepOptions
= ((options
& UIDNA_ALLOW_UNASSIGNED
) != 0) ? USPREP_ALLOW_UNASSIGNED
: 0;
// case flags are not tracked yet, so NULL is passed to the Punycode encoder
214 UBool
* caseFlags
= NULL
;
216 // assume the source contains only ASCII code points
217 UBool srcIsASCII
= TRUE
;
218 // assume the source contains only LDH code points
219 UBool srcIsLDH
= TRUE
;
224 UBool useSTD3ASCIIRules
= (UBool
)((options
& UIDNA_USE_STD3_RULES
) != 0);
// index handed to uprv_syntaxError() when a non-LDH character is reported
226 int32_t failPos
= -1;
229 b1Len
= usprep_prepare(nameprep
, src
, srcLength
, b1
, b1Capacity
, namePrepOptions
, parseError
, status
);
231 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
232 // redo processing of string
233 // the stack buffer was too small; b1Len holds the size that is needed
234 b1
= (UChar
*) uprv_malloc(b1Len
* U_SIZEOF_UCHAR
);
236 *status
= U_MEMORY_ALLOCATION_ERROR
;
240 *status
= U_ZERO_ERROR
; // reset error
242 b1Len
= usprep_prepare(nameprep
, src
, srcLength
, b1
, b1Len
, namePrepOptions
, parseError
, status
);
245 if(U_FAILURE(*status
)){
250 for( j
=0;j
<b1Len
;j
++){
253 }else if(isLDHChar(b1
[j
])==FALSE
){ // if the char is in ASCII range verify that it is an LDH character
259 if(useSTD3ASCIIRules
== TRUE
){
261 // 3(a) Verify the absence of non-LDH ASCII code points; that is, the
262 // absence of 0..2C, 2E..2F, 3A..40, 5B..60, and 7B..7F.
263 // 3(b) Verify the absence of leading and trailing hyphen-minus; that
264 // is, the absence of U+002D at the beginning and end of the
// NOTE(review): b1[b1Len-1] below indexes before the buffer when b1Len is 0;
// no guard for an empty label is visible here - confirm one exists.
266 if( srcIsLDH
== FALSE
/* the source at this point should not contain any non-LDH characters */
267 || b1
[0] == HYPHEN
|| b1
[b1Len
-1] == HYPHEN
){
268 *status
= U_IDNA_STD3_ASCII_RULES_ERROR
;
270 /* populate the parseError struct */
272 // failPos is the index that is reported for a non-LDH character
273 uprv_syntaxError(b1
,failPos
, b1Len
,parseError
);
274 }else if(b1
[0] == HYPHEN
){
275 // fail position is 0
276 uprv_syntaxError(b1
,0,b1Len
,parseError
);
278 // the last index in the source is always length-1
279 uprv_syntaxError(b1
, (b1Len
>0) ? b1Len
-1 : b1Len
, b1Len
,parseError
);
// the prepared text is copied out unchanged only when it fits in dest
286 if(b1Len
<= destCapacity
){
287 uprv_memmove(dest
, b1
, b1Len
* U_SIZEOF_UCHAR
);
294 // step 5 : verify the sequence does not begin with ACE prefix
295 if(!startsWithPrefix(b1
,b1Len
)){
297 //step 6: encode the sequence with punycode
299 // do not preserve the case flags for now!
300 // TODO: Preserve the case while implementing the RFE
301 // caseFlags = (UBool*) uprv_malloc(b1Len * sizeof(UBool));
302 // uprv_memset(caseFlags,TRUE,b1Len);
304 b2Len
= u_strToPunycode(b1
,b1Len
,b2
,b2Capacity
,caseFlags
, status
);
306 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
307 // redo processing of string
308 /* the stack buffer was too small; b2Len holds the size that is needed */
309 b2
= (UChar
*) uprv_malloc(b2Len
* U_SIZEOF_UCHAR
);
311 *status
= U_MEMORY_ALLOCATION_ERROR
;
315 *status
= U_ZERO_ERROR
; // reset error
317 b2Len
= u_strToPunycode(b1
,b1Len
,b2
,b2Len
,caseFlags
, status
);
320 if(U_FAILURE(*status
)){
323 // TODO : Reconsider while implementing the case preserve RFE
324 // convert all codepoints to lower case ASCII
325 // toASCIILower(b2,b2Len);
// the output is the ACE prefix followed by the Punycode text
326 reqLength
= b2Len
+ACE_PREFIX_LENGTH
;
328 if(reqLength
> destCapacity
){
329 *status
= U_BUFFER_OVERFLOW_ERROR
;
332 //Step 7: prepend the ACE prefix
333 uprv_memcpy(dest
,ACE_PREFIX
,ACE_PREFIX_LENGTH
* U_SIZEOF_UCHAR
);
334 //Step 7 (continued): copy the Punycode output in b2 right after the prefix
335 uprv_memcpy(dest
+ACE_PREFIX_LENGTH
, b2
, b2Len
* U_SIZEOF_UCHAR
);
338 *status
= U_IDNA_ACE_PREFIX_ERROR
;
339 //position of failure is 0
340 uprv_syntaxError(b1
,0,b1Len
,parseError
);
// the converted label must not be longer than MAX_LABEL_LENGTH
345 if(reqLength
> MAX_LABEL_LENGTH
){
346 *status
= U_IDNA_LABEL_TOO_LONG_ERROR
;
// caseFlags is never allocated above, so this frees NULL
356 uprv_free(caseFlags
);
358 return u_terminateUChars(dest
, destCapacity
, reqLength
, status
);
/*
 * Converts a single label to its Unicode form.
 *
 * Visible flow:
 *  - src is scanned to record whether it is all ASCII and all LDH;
 *  - a non-ASCII src is run through usprep_prepare() into b1;
 *  - when the ACE prefix is present, the text after the prefix is decoded
 *    with u_strFromPunycode() into b2, b2 is converted back with
 *    uidna_toASCII() into b3, and b3 must match b1 (ASCII case-insensitive),
 *    otherwise U_IDNA_VERIFICATION_ERROR is set; on success b2 is the output;
 *  - without the prefix, src is copied to dest after the optional STD3 check;
 *  - if any step failed, src is copied to dest, since the RFC says ToUnicode
 *    never fails.
 *
 * Returns u_terminateUChars(dest, destCapacity, reqLength, status).
 */
362 _internal_toUnicode(const UChar
* src
, int32_t srcLength
,
363 UChar
* dest
, int32_t destCapacity
,
365 UStringPrepProfile
* nameprep
,
366 UParseError
* parseError
,
370 UBool useSTD3ASCIIRules
= (UBool
)((options
& UIDNA_USE_STD3_RULES
) != 0);
371 int32_t namePrepOptions
= ((options
& UIDNA_ALLOW_UNASSIGNED
) != 0) ? USPREP_ALLOW_UNASSIGNED
: 0;
373 UChar b1Stack
[MAX_LABEL_BUFFER_SIZE
], b2Stack
[MAX_LABEL_BUFFER_SIZE
], b3Stack
[MAX_LABEL_BUFFER_SIZE
];
375 // b1, b2 and b3 start out on the stack buffers; b1Prime will point into b1
376 UChar
*b1
= b1Stack
, *b2
= b2Stack
, *b1Prime
=NULL
, *b3
=b3Stack
;
377 int32_t b1Len
, b2Len
, b1PrimeLen
, b3Len
,
378 b1Capacity
= MAX_LABEL_BUFFER_SIZE
,
379 b2Capacity
= MAX_LABEL_BUFFER_SIZE
,
380 b3Capacity
= MAX_LABEL_BUFFER_SIZE
,
// case flags are not tracked, so NULL is passed to the Punycode decoder
384 UBool
* caseFlags
= NULL
;
386 UBool srcIsASCII
= TRUE
;
387 UBool srcIsLDH
= TRUE
;
390 // step 1: find out if all the codepoints in src are ASCII
// NUL-terminated input: walk until the terminator
393 for(;src
[srcLength
]!=0;){
394 if(src
[srcLength
]> 0x7f){
396 }else if(isLDHChar(src
[srcLength
])==FALSE
){
397 // here we do not assemble surrogates
398 // since we know that LDH code points
399 // are in the ASCII range only
// explicit-length input
405 }else if(srcLength
> 0){
406 for(int32_t j
=0; j
<srcLength
; j
++){
409 }else if(isLDHChar(src
[j
])==FALSE
){
410 // here we do not assemble surrogates
411 // since we know that LDH code points
412 // are in the ASCII range only
421 if(srcIsASCII
== FALSE
){
422 // step 2: process the string
423 b1Len
= usprep_prepare(nameprep
, src
, srcLength
, b1
, b1Capacity
, namePrepOptions
, parseError
, status
);
424 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
425 // redo processing of string
426 /* the stack buffer was too small; b1Len holds the size that is needed */
427 b1
= (UChar
*) uprv_malloc(b1Len
* U_SIZEOF_UCHAR
);
429 *status
= U_MEMORY_ALLOCATION_ERROR
;
433 *status
= U_ZERO_ERROR
; // reset error
435 b1Len
= usprep_prepare(nameprep
, src
, srcLength
, b1
, b1Len
, namePrepOptions
, parseError
, status
);
438 if(U_FAILURE(*status
)){
443 // all-ASCII input needs no preparation: presumably b1 is pointed at src
// here (the cleanup code below tests b1 != src) - confirm
448 //step 3: verify ACE Prefix
// NOTE(review): the prefix is tested on src/srcLength, but it is removed from
// b1/b1Len below. If usprep_prepare() changed the leading characters the two
// can disagree - confirm whether this should test b1/b1Len instead.
449 if(startsWithPrefix(src
,srcLength
)){
451 //step 4: Remove the ACE Prefix
452 b1Prime
= b1
+ ACE_PREFIX_LENGTH
;
453 b1PrimeLen
= b1Len
- ACE_PREFIX_LENGTH
;
455 //step 5: Decode using punycode
456 b2Len
= u_strFromPunycode(b1Prime
, b1PrimeLen
, b2
, b2Capacity
, caseFlags
,status
);
458 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
459 // redo processing of string
460 /* the stack buffer was too small; b2Len holds the size that is needed */
461 b2
= (UChar
*) uprv_malloc(b2Len
* U_SIZEOF_UCHAR
);
463 *status
= U_MEMORY_ALLOCATION_ERROR
;
467 *status
= U_ZERO_ERROR
; // reset error
469 b2Len
= u_strFromPunycode(b1Prime
, b1PrimeLen
, b2
, b2Len
, caseFlags
, status
);
474 //step 6: apply toASCII to the decoded text so it can be verified against b1
475 b3Len
= uidna_toASCII(b2
, b2Len
, b3
, b3Capacity
,options
,parseError
, status
);
477 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
478 // redo processing of string
479 /* the stack buffer was too small; b3Len holds the size that is needed */
480 b3
= (UChar
*) uprv_malloc(b3Len
* U_SIZEOF_UCHAR
);
482 *status
= U_MEMORY_ALLOCATION_ERROR
;
486 *status
= U_ZERO_ERROR
; // reset error
488 b3Len
= uidna_toASCII(b2
,b2Len
,b3
,b3Len
,options
,parseError
, status
);
492 if(U_FAILURE(*status
)){
// the round trip must reproduce the prepared input, ignoring ASCII case
497 if(compareCaseInsensitiveASCII(b1
, b1Len
, b3
, b3Len
) !=0){
498 *status
= U_IDNA_VERIFICATION_ERROR
;
502 //step 8: return output of step 5
504 if(b2Len
<= destCapacity
) {
505 uprv_memmove(dest
, b2
, b2Len
* U_SIZEOF_UCHAR
);
508 // verify that STD3 ASCII rules are satisfied
509 if(useSTD3ASCIIRules
== TRUE
){
510 if( srcIsLDH
== FALSE
/* source contains some non-LDH characters */
511 || src
[0] == HYPHEN
|| src
[srcLength
-1] == HYPHEN
){
512 *status
= U_IDNA_STD3_ASCII_RULES_ERROR
;
514 /* populate the parseError struct */
516 // failPos is the index that is reported for a non-LDH character
517 uprv_syntaxError(src
,failPos
, srcLength
,parseError
);
518 }else if(src
[0] == HYPHEN
){
519 // fail position is 0
520 uprv_syntaxError(src
,0,srcLength
,parseError
);
522 // the last index in the source is always length-1
523 uprv_syntaxError(src
, (srcLength
>0) ? srcLength
-1 : srcLength
, srcLength
,parseError
);
529 //copy the source to destination
530 if(srcLength
<= destCapacity
){
531 uprv_memmove(dest
,src
,srcLength
* U_SIZEOF_UCHAR
);
533 reqLength
= srcLength
;
// b1 is tested against both the stack buffer and src before cleanup
538 if(b1
!= b1Stack
&& b1
!=src
){
544 uprv_free(caseFlags
);
547 // The RFC states that
549 // ToUnicode never fails. If any step fails, then the original input
550 // is returned immediately in that step.
552 // So if any step fails, copy the source to the destination
553 if(U_FAILURE(*status
)){
554 //copy the source to destination
// NOTE(review): if srcLength can still be -1 here, the capacity test below is
// always true and u_strlen(src) units are copied without a bounds check, and
// reqLength becomes -1 - confirm srcLength is always resolved before this.
555 if(dest
&& srcLength
<= destCapacity
){
556 if(srcLength
== -1) {
557 uprv_memmove(dest
,src
,u_strlen(src
)* U_SIZEOF_UCHAR
);
559 uprv_memmove(dest
,src
,srcLength
* U_SIZEOF_UCHAR
);
562 reqLength
= srcLength
;
565 return u_terminateUChars(dest
, destCapacity
, reqLength
, status
);
/*
 * Public entry point that converts one label to ASCII.
 *
 * Checks status and the arguments, opens the StringPrep profile named by
 * DATA_FILE_NAME, delegates the conversion to _internal_toASCII() and closes
 * the profile again.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR when src is NULL, srcLength is below -1,
 * destCapacity is negative, or dest is NULL while destCapacity is positive.
 */
568 U_CAPI
int32_t U_EXPORT2
569 uidna_toASCII(const UChar
* src
, int32_t srcLength
,
570 UChar
* dest
, int32_t destCapacity
,
572 UParseError
* parseError
,
// nothing is done when there is no status or it already holds a failure
575 if(status
== NULL
|| U_FAILURE(*status
)){
578 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
579 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// the profile is opened for every call and closed before returning
583 UStringPrepProfile
* nameprep
= usprep_open(NULL
,DATA_FILE_NAME
, status
);
585 if(U_FAILURE(*status
)){
589 int32_t retLen
= _internal_toASCII(src
, srcLength
, dest
, destCapacity
, options
, nameprep
, parseError
, status
);
591 /* close the profile */
592 usprep_close(nameprep
);
/*
 * Public entry point that converts one label to Unicode.
 *
 * Checks status and the arguments, opens the StringPrep profile named by
 * DATA_FILE_NAME, delegates the conversion to _internal_toUnicode() and
 * closes the profile again.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR when src is NULL, srcLength is below -1,
 * destCapacity is negative, or dest is NULL while destCapacity is positive.
 */
597 U_CAPI
int32_t U_EXPORT2
598 uidna_toUnicode(const UChar
* src
, int32_t srcLength
,
599 UChar
* dest
, int32_t destCapacity
,
601 UParseError
* parseError
,
// nothing is done when there is no status or it already holds a failure
604 if(status
== NULL
|| U_FAILURE(*status
)){
607 if( (src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
608 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// the profile is opened for every call and closed before returning
612 UStringPrepProfile
* nameprep
= usprep_open(NULL
, DATA_FILE_NAME
, status
);
614 if(U_FAILURE(*status
)){
618 int32_t retLen
= _internal_toUnicode(src
, srcLength
, dest
, destCapacity
, options
, nameprep
, parseError
, status
);
620 usprep_close(nameprep
);
/*
 * Converts a whole domain name to ASCII, one label at a time.
 *
 * Each label found by getNextSeparator() is converted with
 * _internal_toASCII() directly into dest, and FULL_STOP is written between
 * labels while room remains. A label that overflows dest does not stop the
 * walk: the overflow is cleared and the remaining capacity is set to 0 so
 * that reqLength keeps accumulating the total size that is needed.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR when src is NULL, srcLength is below -1,
 * destCapacity is negative, or dest is NULL while destCapacity is positive.
 * Returns u_terminateUChars(dest, destCapacity, reqLength, status).
 */
626 U_CAPI
int32_t U_EXPORT2
627 uidna_IDNToASCII( const UChar
*src
, int32_t srcLength
,
628 UChar
* dest
, int32_t destCapacity
,
630 UParseError
*parseError
,
// nothing is done when there is no status or it already holds a failure
633 if(status
== NULL
|| U_FAILURE(*status
)){
636 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
637 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// running total of the output length that is required
641 int32_t reqLength
= 0;
643 UStringPrepProfile
* nameprep
= usprep_open(NULL
, DATA_FILE_NAME
, status
);
645 if(U_FAILURE(*status
)){
649 // cursors into the source and destination; const is cast away from src
// only so that the cursors can be passed to getNextSeparator()
650 UChar
*delimiter
= (UChar
*)src
;
651 UChar
*labelStart
= (UChar
*)src
;
652 UChar
*currentDest
= (UChar
*) dest
;
653 int32_t remainingLen
= srcLength
;
654 int32_t remainingDestCapacity
= destCapacity
;
655 int32_t labelLen
= 0, labelReqLength
= 0;
// find the end of the next label; delimiter is moved past its separator
661 labelLen
= getNextSeparator(labelStart
,remainingLen
, &delimiter
,&done
);
663 labelReqLength
= _internal_toASCII( labelStart
, labelLen
,
664 currentDest
, remainingDestCapacity
,
668 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
670 *status
= U_ZERO_ERROR
; // reset error
// from here on only the required length is measured
671 remainingDestCapacity
= 0;
675 if(U_FAILURE(*status
)){
679 reqLength
+=labelReqLength
;
680 // adjust the destination pointer
681 if(labelReqLength
< remainingDestCapacity
){
682 currentDest
= currentDest
+ labelReqLength
;
683 remainingDestCapacity
-= labelReqLength
;
685 // should never occur
686 remainingDestCapacity
= 0;
692 // add the label separator
693 if(remainingDestCapacity
> 0){
694 *currentDest
++ = FULL_STOP
;
695 remainingDestCapacity
--;
// the next label begins where the previous separator ended
699 labelStart
= delimiter
;
700 if(remainingLen
>0 ){
701 remainingLen
= srcLength
- (delimiter
- src
);
706 usprep_close(nameprep
);
708 return u_terminateUChars(dest
, destCapacity
, reqLength
, status
);
/*
 * Converts a whole domain name to Unicode, one label at a time.
 *
 * Each label found by getNextSeparator() is converted with
 * _internal_toUnicode() directly into dest, and FULL_STOP is written between
 * labels while room remains. A label that overflows dest does not stop the
 * walk: the overflow is cleared and the remaining capacity is set to 0 so
 * that reqLength keeps accumulating the total size that is needed.
 *
 * Sets U_ILLEGAL_ARGUMENT_ERROR when src is NULL, srcLength is below -1,
 * destCapacity is negative, or dest is NULL while destCapacity is positive.
 * Returns u_terminateUChars(dest, destCapacity, reqLength, status).
 */
711 U_CAPI
int32_t U_EXPORT2
712 uidna_IDNToUnicode( const UChar
* src
, int32_t srcLength
,
713 UChar
* dest
, int32_t destCapacity
,
715 UParseError
* parseError
,
// nothing is done when there is no status or it already holds a failure
718 if(status
== NULL
|| U_FAILURE(*status
)){
721 if((src
==NULL
) || (srcLength
< -1) || (destCapacity
<0) || (!dest
&& destCapacity
> 0)){
722 *status
= U_ILLEGAL_ARGUMENT_ERROR
;
// running total of the output length that is required
726 int32_t reqLength
= 0;
728 UStringPrepProfile
* nameprep
= usprep_open(NULL
, DATA_FILE_NAME
, status
);
730 if(U_FAILURE(*status
)){
734 // cursors into the source and destination; const is cast away from src
// only so that the cursors can be passed to getNextSeparator()
735 UChar
*delimiter
= (UChar
*)src
;
736 UChar
*labelStart
= (UChar
*)src
;
737 UChar
*currentDest
= (UChar
*) dest
;
738 int32_t remainingLen
= srcLength
;
739 int32_t remainingDestCapacity
= destCapacity
;
740 int32_t labelLen
= 0, labelReqLength
= 0;
// find the end of the next label; delimiter is moved past its separator
746 labelLen
= getNextSeparator(labelStart
,remainingLen
, &delimiter
,&done
);
748 labelReqLength
= _internal_toUnicode(labelStart
, labelLen
,
749 currentDest
, remainingDestCapacity
,
753 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
755 *status
= U_ZERO_ERROR
; // reset error
// from here on only the required length is measured
756 remainingDestCapacity
= 0;
760 if(U_FAILURE(*status
)){
764 reqLength
+=labelReqLength
;
765 // adjust the destination pointer
766 if(labelReqLength
< remainingDestCapacity
){
767 currentDest
= currentDest
+ labelReqLength
;
768 remainingDestCapacity
-= labelReqLength
;
770 // should never occur
771 remainingDestCapacity
= 0;
778 // add the label separator
779 if(remainingDestCapacity
> 0){
780 *currentDest
++ = FULL_STOP
;
781 remainingDestCapacity
--;
// the next label begins where the previous separator ended
785 labelStart
= delimiter
;
786 if(remainingLen
>0 ){
787 remainingLen
= srcLength
- (delimiter
- src
);
792 usprep_close(nameprep
);
794 return u_terminateUChars(dest
, destCapacity
, reqLength
, status
);
/*
 * Compares two domain names for IDNA equivalence.
 *
 * Both names are converted with uidna_IDNToASCII() (stack buffers of
 * MAX_IDN_BUFFER_SIZE first, heap buffers when U_BUFFER_OVERFLOW_ERROR is
 * set) and the two ASCII forms are then compared with
 * compareCaseInsensitiveASCII().
 */
797 U_CAPI
int32_t U_EXPORT2
798 uidna_compare( const UChar
*s1
, int32_t length1
,
799 const UChar
*s2
, int32_t length2
,
// nothing is done when there is no status or it already holds a failure
803 if(status
== NULL
|| U_FAILURE(*status
)){
807 UChar b1Stack
[MAX_IDN_BUFFER_SIZE
], b2Stack
[MAX_IDN_BUFFER_SIZE
];
808 UChar
*b1
= b1Stack
, *b2
= b2Stack
;
809 int32_t b1Len
, b2Len
, b1Capacity
= MAX_IDN_BUFFER_SIZE
, b2Capacity
= MAX_IDN_BUFFER_SIZE
;
// one parseError is shared by both conversions, so the second call may
// overwrite what the first one recorded
812 UParseError parseError
;
814 b1Len
= uidna_IDNToASCII(s1
, length1
, b1
, b1Capacity
, options
, &parseError
, status
);
815 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
816 // redo processing of string with a heap buffer of the reported size
817 b1
= (UChar
*) uprv_malloc(b1Len
* U_SIZEOF_UCHAR
);
819 *status
= U_MEMORY_ALLOCATION_ERROR
;
823 *status
= U_ZERO_ERROR
; // reset error
825 b1Len
= uidna_IDNToASCII(s1
,length1
,b1
,b1Len
, options
, &parseError
, status
);
829 b2Len
= uidna_IDNToASCII(s2
,length2
, b2
,b2Capacity
, options
, &parseError
, status
);
830 if(*status
== U_BUFFER_OVERFLOW_ERROR
){
831 // redo processing of string with a heap buffer of the reported size
832 b2
= (UChar
*) uprv_malloc(b2Len
* U_SIZEOF_UCHAR
);
834 *status
= U_MEMORY_ALLOCATION_ERROR
;
838 *status
= U_ZERO_ERROR
; // reset error
840 b2Len
= uidna_IDNToASCII(s2
, length2
, b2
, b2Len
, options
, &parseError
, status
);
843 // when toASCII is applied all label separators are replaced with FULL_STOP
844 result
= compareCaseInsensitiveASCII(b1
,b1Len
,b2
,b2Len
);
858 #endif /* #if !UCONFIG_NO_IDNA */