]> git.saurik.com Git - apple/icu.git/blame - icuSources/common/strprep.cpp
ICU-3.13.tar.gz
[apple/icu.git] / icuSources / common / strprep.cpp
CommitLineData
b75a7d8f
A
1/*
2 *******************************************************************************
3 *
4 * Copyright (C) 2003, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: strprep.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2003feb1
14 * created by: Ram Viswanadha
15 */
16
17#include "unicode/utypes.h"
18
19#if !UCONFIG_NO_IDNA
20
21#include "strprep.h"
22#include "utrie.h"
23#include "umutex.h"
24#include "cmemory.h"
25#include "sprpimpl.h"
26#include "nameprep.h"
27#include "ustr_imp.h"
28#include "unicode/unorm.h"
29#include "unicode/udata.h"
30#include "unicode/ustring.h"
31
32static const uint16_t* mappingData = NULL;
33static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
34static UBool _isDataLoaded = FALSE;
35static UTrie idnTrie={ 0,0,0,0,0,0,0 };
36static UDataMemory* idnData=NULL;
37static UErrorCode dataErrorCode =U_ZERO_ERROR;
38/* file definitions */
39static const char DATA_NAME[] = "uidna";
40static const char DATA_TYPE[] = "icu";
41
42U_CFUNC UBool
43ustrprep_cleanup() {
44 if(idnData!=NULL) {
45 udata_close(idnData);
46 idnData=NULL;
47 }
48 dataErrorCode=U_ZERO_ERROR;
49 _isDataLoaded=FALSE;
50
51 return TRUE;
52}
53
54U_CDECL_BEGIN
55static UBool U_CALLCONV
56isAcceptable(void * /* context */,
57 const char * /* type */,
58 const char * /* name */,
59 const UDataInfo *pInfo) {
60 if(
61 pInfo->size>=20 &&
62 pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
63 pInfo->charsetFamily==U_CHARSET_FAMILY &&
64 pInfo->dataFormat[0]==0x49 && /* dataFormat="IDNA" 0x49, 0x44, 0x4e, 0x41 */
65 pInfo->dataFormat[1]==0x44 &&
66 pInfo->dataFormat[2]==0x4e &&
67 pInfo->dataFormat[3]==0x41 &&
68 pInfo->formatVersion[0]==2 &&
69 pInfo->formatVersion[2]==UTRIE_SHIFT &&
70 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
71 ) {
72 return TRUE;
73 } else {
74 return FALSE;
75 }
76}
77
78
79
80static int32_t U_CALLCONV
81getFoldingOffset(uint32_t data) {
82 if(data&0x8000) {
83 return (int32_t)(data&0x7fff);
84 } else {
85 return 0;
86 }
87}
88
89U_CDECL_END
90
91static UBool U_CALLCONV
92loadData(UErrorCode &errorCode) {
93 /* load Unicode IDNA data from file */
94 UBool isCached;
95
96 /* do this because double-checked locking is broken */
97 umtx_lock(NULL);
98 isCached=_isDataLoaded;
99 umtx_unlock(NULL);
100
101 if(!isCached) {
102 UTrie _idnTrie={ 0,0,0,0,0,0,0 };
103 UDataMemory *data;
104 const int32_t *p=NULL;
105 const uint8_t *pb;
106
107 if(&errorCode==NULL || U_FAILURE(errorCode)) {
108 return 0;
109 }
110
111 /* open the data outside the mutex block */
112 //TODO: change the path
113 data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
114 dataErrorCode=errorCode;
115 if(U_FAILURE(errorCode)) {
116 return _isDataLoaded=FALSE;
117 }
118
119 p=(const int32_t *)udata_getMemory(data);
120 pb=(const uint8_t *)(p+_IDNA_INDEX_TOP);
121 utrie_unserialize(&_idnTrie, pb, p[_IDNA_INDEX_TRIE_SIZE], &errorCode);
122 _idnTrie.getFoldingOffset=getFoldingOffset;
123
124
125 if(U_FAILURE(errorCode)) {
126 dataErrorCode=errorCode;
127 udata_close(data);
128 return _isDataLoaded=FALSE;
129 }
130
131 /* in the mutex block, set the data for this process */
132 umtx_lock(NULL);
133 if(idnData==NULL) {
134 idnData=data;
135 data=NULL;
136 uprv_memcpy(&indexes, p, sizeof(indexes));
137 uprv_memcpy(&idnTrie, &_idnTrie, sizeof(UTrie));
138 } else {
139 p=(const int32_t *)udata_getMemory(idnData);
140 }
141 umtx_unlock(NULL);
142 /* initialize some variables */
143 mappingData=(uint16_t *)((uint8_t *)(p+_IDNA_INDEX_TOP)+indexes[_IDNA_INDEX_TRIE_SIZE]);
144
145 _isDataLoaded = TRUE;
146
147 /* if a different thread set it first, then close the extra data */
148 if(data!=NULL) {
149 udata_close(data); /* NULL if it was set correctly */
150 }
151 }
152
153 return _isDataLoaded;
154}
155
156// *****************************************************************************
157// class StringPrep
158// *****************************************************************************
159
160U_NAMESPACE_BEGIN
161
162const char StringPrep::fgClassID=0;
163
164UBool StringPrep::isDataLoaded(UErrorCode& status){
165 if(U_FAILURE(status)){
166 return FALSE;
167 }
168 if(_isDataLoaded==FALSE && U_FAILURE(dataErrorCode)){
169 status = dataErrorCode;
170 return FALSE;
171 }
172 loadData(dataErrorCode);
173 if(U_FAILURE(dataErrorCode)){
174 status = dataErrorCode;
175 return FALSE;
176 }
177 return TRUE;
178}
179
180
181StringPrep* StringPrep::createDefaultInstance(UErrorCode& status){
182 StringPrep* strprep = new StringPrep();
183 if(!isDataLoaded(status)){
184 delete strprep;
185 return NULL;
186 }
187 return strprep;
188}
189
190StringPrep* StringPrep::createNameprepInstance(UErrorCode& status){
191 StringPrep* strprep = new NamePrep(status);
192 if(!isDataLoaded(status)){
193 delete strprep;
194 return NULL;
195 }
196 return strprep;
197}
198
199UBool StringPrep::isNotProhibited(UChar32 /*ch*/){
200 return FALSE;
201}
202UBool StringPrep::isUnassigned(UChar32 ch){
203
204 uint32_t result;
205 UTRIE_GET16(&idnTrie,ch,result);
206 return (result == UIDNA_UNASSIGNED);
207
208}
209
210
/**
 * Splits a trie data value into its three packed fields.
 *
 * @param result trie data value
 * @param flag   receives bits 0..2
 * @param length receives bits 3..4
 * @param index  receives everything above bit 4
 */
static inline void getValues(uint32_t result, int8_t& flag,
                             int8_t& length, int32_t& index){
    /* lowest 3 bits: the flag */
    flag = (int8_t) (result & 0x07);
    result >>= 3;

    /* next 2 bits: the length */
    length = (int8_t) (result & 0x03);
    result >>= 2;

    /* remaining upper bits: the index */
    index = result;
}
220
221
222int32_t StringPrep::map(const UChar* src, int32_t srcLength,
223 UChar* dest, int32_t destCapacity,
224 UBool allowUnassigned,
225 UParseError* parseError,
226 UErrorCode& status ){
227
228 uint32_t result;
229 int8_t flag;
230 int8_t length;
231 int32_t index;
232 int32_t destIndex=0;
233 int32_t srcIndex=0;
234
235 // check error status
236 if(U_FAILURE(status)){
237 return 0;
238 }
239
240 //check arguments
241 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
242 status=U_ILLEGAL_ARGUMENT_ERROR;
243 return 0;
244 }
245 if(srcLength == -1){
246 srcLength = u_strlen(src);
247 }
248
249 for(;srcIndex<srcLength;){
250 UChar32 ch;
251
252 U16_NEXT(src,srcIndex,srcLength,ch);
253
254 UTRIE_GET16(&idnTrie,ch,result);
255
256 getValues(result,flag,length,index);
257
258 // check if the source codepoint is unassigned
259 if(flag == UIDNA_UNASSIGNED){
260 if(allowUnassigned == TRUE){
261 //copy the ch to destination
262 if(ch <= 0xFFFF){
263 if(destIndex < destCapacity ){
264 dest[destIndex] = (UChar)ch;
265 }
266 destIndex++;
267 }else{
268 if(destIndex+1 < destCapacity ){
269 dest[destIndex] = U16_LEAD(ch);
270 dest[destIndex+1] = U16_TRAIL(ch);
271 }
272 destIndex +=2;
273 }
274 }else{
275 uprv_syntaxError(src,srcIndex-U16_LENGTH(ch), srcLength,parseError);
276 status = U_IDNA_UNASSIGNED_CODEPOINT_FOUND_ERROR;
277 return 0;
278 }
279 }else if((flag == UIDNA_MAP_NFKC && doNFKC == TRUE) ||
280 (index == _IDNA_MAP_TO_NOTHING && doNFKC == FALSE)){
281
282 if(length == _IDNA_LENGTH_IN_MAPPING_TABLE){
283 length = (int8_t) mappingData[index++];
284 }
285
286 for(int8_t i =0; i< length; i++){
287 if(destIndex < destCapacity ){
288 dest[destIndex] = mappingData[index+i];
289 }
290 destIndex++; /* for pre-flighting */
291 }
292 }else{
293 //copy the source into destination
294 if(ch <= 0xFFFF){
295 if(destIndex < destCapacity ){
296 dest[destIndex] = (UChar)ch;
297 }
298 destIndex++;
299 }else{
300 if(destIndex+1 < destCapacity ){
301 dest[destIndex] = U16_LEAD(ch);
302 dest[destIndex+1] = U16_TRAIL(ch);
303 }
304 destIndex +=2;
305 }
306 }
307 }
308
309 return u_terminateUChars(dest, destCapacity, destIndex, &status);
310}
311
312
313int32_t StringPrep::normalize( const UChar* src, int32_t srcLength,
314 UChar* dest, int32_t destCapacity,
315 UErrorCode& status ){
316
317 return unorm_normalize(src,srcLength,UNORM_NFKC,UNORM_UNICODE_3_2,dest,destCapacity,&status);
318}
319
320
321 /*
322 1) Map -- For each character in the input, check if it has a mapping
323 and, if so, replace it with its mapping.
324
325 2) Normalize -- Possibly normalize the result of step 1 using Unicode
326 normalization.
327
328 3) Prohibit -- Check for any characters that are not allowed in the
329 output. If any are found, return an error.
330
331 4) Check bidi -- Possibly check for right-to-left characters, and if
332 any are found, make sure that the whole string satisfies the
333 requirements for bidirectional strings. If the string does not
334 satisfy the requirements for bidirectional strings, return an
335 error.
336 [Unicode3.2] defines several bidirectional categories; each character
337 has one bidirectional category assigned to it. For the purposes of
338 the requirements below, an "RandALCat character" is a character that
339 has Unicode bidirectional categories "R" or "AL"; an "LCat character"
   is a character that has Unicode bidirectional category "L". Note
   that there are many characters which fall in neither of the above
344 definitions; Latin digits (<U+0030> through <U+0039>) are examples of
345 this because they have bidirectional category "EN".
346
347 In any profile that specifies bidirectional character handling, all
348 three of the following requirements MUST be met:
349
350 1) The characters in section 5.8 MUST be prohibited.
351
352 2) If a string contains any RandALCat character, the string MUST NOT
353 contain any LCat character.
354
355 3) If a string contains any RandALCat character, a RandALCat
356 character MUST be the first character of the string, and a
357 RandALCat character MUST be the last character of the string.
358*/
359
360#define MAX_STACK_BUFFER_SIZE 300
361
362int32_t StringPrep::process(const UChar* src, int32_t srcLength,
363 UChar* dest, int32_t destCapacity,
364 UBool allowUnassigned,
365 UParseError* parseError,
366 UErrorCode& status ){
367 // check error status
368 if(U_FAILURE(status)){
369 return 0;
370 }
371
372 //check arguments
373 if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
374 status=U_ILLEGAL_ARGUMENT_ERROR;
375 return 0;
376 }
377
378 UChar b1Stack[MAX_STACK_BUFFER_SIZE], b2Stack[MAX_STACK_BUFFER_SIZE];
379 UChar *b1 = b1Stack, *b2 = b2Stack;
380 int32_t b1Len, b2Len=0,
381 b1Capacity = MAX_STACK_BUFFER_SIZE ,
382 b2Capacity = MAX_STACK_BUFFER_SIZE;
383 uint32_t result;
384 int32_t b2Index = 0;
385 int8_t flag;
386 int8_t length;
387 int32_t index;
388 UCharDirection direction=U_CHAR_DIRECTION_COUNT, firstCharDir=U_CHAR_DIRECTION_COUNT;
389 UBool leftToRight=FALSE, rightToLeft=FALSE;
390 int32_t rtlPos =-1, ltrPos =-1;
391
392 b1Len = map(src,srcLength, b1, b1Capacity,allowUnassigned, parseError, status);
393
394 if(status == U_BUFFER_OVERFLOW_ERROR){
395 // redo processing of string
396 /* we do not have enough room so grow the buffer*/
397 b1 = (UChar*) uprv_malloc(b1Len * U_SIZEOF_UCHAR);
398 if(b1==NULL){
399 status = U_MEMORY_ALLOCATION_ERROR;
400 goto CLEANUP;
401 }
402
403 status = U_ZERO_ERROR; // reset error
404
405 b1Len = map(src,srcLength, b1, b1Len,allowUnassigned, parseError, status);
406
407 }
408
409 b2Len = normalize(b1,b1Len, b2,b2Capacity,status);
410
411 if(status == U_BUFFER_OVERFLOW_ERROR){
412 // redo processing of string
413 /* we do not have enough room so grow the buffer*/
414 b2 = (UChar*) uprv_malloc(b2Len * U_SIZEOF_UCHAR);
415 if(b2==NULL){
416 status = U_MEMORY_ALLOCATION_ERROR;
417 goto CLEANUP;
418 }
419
420 status = U_ZERO_ERROR; // reset error
421
422 b2Len = normalize(b2,b2Len, b2,b2Len,status);
423
424 }
425
426 if(U_FAILURE(status)){
427 goto CLEANUP;
428 }
429
430 UChar32 ch;
431
432 for(; b2Index<b2Len;){
433
434 ch = 0;
435
436 U16_NEXT(b2, b2Index, b2Len, ch);
437
438 UTRIE_GET16(&idnTrie,ch,result);
439
440 getValues(result,flag,length,index);
441
442 if(flag == UIDNA_PROHIBITED
443 && isNotProhibited(ch) == FALSE){
444 status = U_IDNA_PROHIBITED_CODEPOINT_FOUND_ERROR;
445 uprv_syntaxError(b1, b2Index-U16_LENGTH(ch), b2Len, parseError);
446 goto CLEANUP;
447 }
448
449 direction = u_charDirection(ch);
450 if(firstCharDir == U_CHAR_DIRECTION_COUNT){
451 firstCharDir = direction;
452 }
453 if(direction == U_LEFT_TO_RIGHT){
454 leftToRight = TRUE;
455 ltrPos = b2Index-1;
456 }
457 if(direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC){
458 rightToLeft = TRUE;
459 rtlPos = b2Index-1;
460 }
461 }
462
463 // satisfy 2
464 if( leftToRight == TRUE && rightToLeft == TRUE){
465 status = U_IDNA_CHECK_BIDI_ERROR;
466 uprv_syntaxError(b2,(rtlPos>ltrPos) ? rtlPos : ltrPos, b2Len, parseError);
467 goto CLEANUP;
468 }
469
470 //satisfy 3
471 if( rightToLeft == TRUE &&
472 !((firstCharDir == U_RIGHT_TO_LEFT || firstCharDir == U_RIGHT_TO_LEFT_ARABIC) &&
473 (direction == U_RIGHT_TO_LEFT || direction == U_RIGHT_TO_LEFT_ARABIC))
474 ){
475 status = U_IDNA_CHECK_BIDI_ERROR;
476 uprv_syntaxError(b2, rtlPos, b2Len, parseError);
477 return FALSE;
478 }
479
480 if(b2Len <= destCapacity){
481 uprv_memmove(dest,b2, b2Len*U_SIZEOF_UCHAR);
482 }
483
484CLEANUP:
485 if(b1!=b1Stack){
486 uprv_free(b1);
487 }
488 if(b2!=b2Stack){
489 uprv_free(b2);
490 }
491 return u_terminateUChars(dest, destCapacity, b2Len, &status);
492}
493
494
495UBool StringPrep::isLabelSeparator(UChar32 ch, UErrorCode& status){
496 // check error status
497 if(U_FAILURE(status)){
498 return FALSE;
499 }
500
501 if(isDataLoaded(status)){
502 int32_t result;
503 UTRIE_GET16(&idnTrie,ch, result);
504 if( (result & 0x07) == UIDNA_LABEL_SEPARATOR){
505 return TRUE;
506 }
507 }
508 return FALSE;
509}
510
511U_NAMESPACE_END
512
513#endif /* #if !UCONFIG_NO_IDNA */