]> git.saurik.com Git - apple/icu.git/blob - icuSources/common/ushape.c
ICU-6.2.8.tar.gz
[apple/icu.git] / icuSources / common / ushape.c
1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2000-2004, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: ushape.c
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2000jun29
14 * created by: Markus W. Scherer
15 *
16 * Arabic letter shaping implemented by Ayman Roshdy
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uchar.h"
21 #include "unicode/ustring.h"
22 #include "unicode/ushape.h"
23 #include "cmemory.h"
24 #include "putilimp.h"
25 #include "ustr_imp.h"
26
27 #if UTF_SIZE<16
28 /*
29 * This implementation assumes that the internal encoding is UTF-16
30 * or UTF-32, not UTF-8.
31 * The main assumption is that the Arabic characters and their
32 * presentation forms each fit into a single UChar.
33 * With UTF-8, they occupy 2 or 3 bytes, and more than the ASCII
34 * characters.
35 */
36 # error This implementation assumes UTF-16 or UTF-32 (check UTF_SIZE)
37 #endif
38
39 /*
40 * ### TODO in general for letter shaping:
41 * - the letter shaping code is UTF-16-unaware; needs update
42 * + especially invertBuffer()?!
43 * - needs to handle the "Arabic Tail" that is used in some legacy codepages
44 * as a glyph fragment of wide-glyph letters
45 * + IBM Unicode conversion tables map it to U+200B (ZWSP)
46 * + IBM Egypt has proposed to encode the tail in Unicode among Arabic Presentation Forms
47 */
48
49 /* definitions for Arabic letter shaping ------------------------------------ */
50
/*
 * Flag bits kept in the low byte of the link values returned by getLink().
 * LINKR and LINKL together form the 2-bit index used with shapeTable[]
 * (presumably "joins on the right" / "joins on the left" -- confirm).
 * IRRELEVANT marks characters that shapeUnicode() skips when it looks for
 * the neighbouring letter; LAMTYPE and ALEFTYPE tag the letters that can
 * combine into a LamAlef.
 */
#define LINKR       0x01
#define LINKL       0x02
#define IRRELEVANT  0x04
#define LAMTYPE     0x10
#define ALEFTYPE    0x20
56
57 static const UChar IrrelevantPos[] = {
58 0x0, 0x2, 0x4, 0x6,
59 0x8, 0xA, 0xC, 0xE,
60 };
61
62 static const UChar convertLamAlef[] =
63 {
64 /*FEF5*/ 0x0622,
65 /*FEF6*/ 0x0622,
66 /*FEF7*/ 0x0623,
67 /*FEF8*/ 0x0623,
68 /*FEF9*/ 0x0625,
69 /*FEFA*/ 0x0625,
70 /*FEFB*/ 0x0627,
71 /*FEFC*/ 0x0627
72 };
73
74 static const UChar araLink[178]=
75 {
76 1 + 32 + 256 * 0x11,/*0x0622*/
77 1 + 32 + 256 * 0x13,/*0x0623*/
78 1 + 256 * 0x15,/*0x0624*/
79 1 + 32 + 256 * 0x17,/*0x0625*/
80 1 + 2 + 256 * 0x19,/*0x0626*/
81 1 + 32 + 256 * 0x1D,/*0x0627*/
82 1 + 2 + 256 * 0x1F,/*0x0628*/
83 1 + 256 * 0x23,/*0x0629*/
84 1 + 2 + 256 * 0x25,/*0x062A*/
85 1 + 2 + 256 * 0x29,/*0x062B*/
86 1 + 2 + 256 * 0x2D,/*0x062C*/
87 1 + 2 + 256 * 0x31,/*0x062D*/
88 1 + 2 + 256 * 0x35,/*0x062E*/
89 1 + 256 * 0x39,/*0x062F*/
90 1 + 256 * 0x3B,/*0x0630*/
91 1 + 256 * 0x3D,/*0x0631*/
92 1 + 256 * 0x3F,/*0x0632*/
93 1 + 2 + 256 * 0x41,/*0x0633*/
94 1 + 2 + 256 * 0x45,/*0x0634*/
95 1 + 2 + 256 * 0x49,/*0x0635*/
96 1 + 2 + 256 * 0x4D,/*0x0636*/
97 1 + 2 + 256 * 0x51,/*0x0637*/
98 1 + 2 + 256 * 0x55,/*0x0638*/
99 1 + 2 + 256 * 0x59,/*0x0639*/
100 1 + 2 + 256 * 0x5D,/*0x063A*/
101 0, 0, 0, 0, 0, /*0x063B-0x063F*/
102 1 + 2, /*0x0640*/
103 1 + 2 + 256 * 0x61,/*0x0641*/
104 1 + 2 + 256 * 0x65,/*0x0642*/
105 1 + 2 + 256 * 0x69,/*0x0643*/
106 1 + 2 + 16 + 256 * 0x6D,/*0x0644*/
107 1 + 2 + 256 * 0x71,/*0x0645*/
108 1 + 2 + 256 * 0x75,/*0x0646*/
109 1 + 2 + 256 * 0x79,/*0x0647*/
110 1 + 256 * 0x7D,/*0x0648*/
111 1 + 256 * 0x7F,/*0x0649*/
112 1 + 2 + 256 * 0x81,/*0x064A*/
113 4, 4, 4, 4, /*0x064B-0x064E*/
114 4, 4, 4, 4, /*0x064F-0x0652*/
115 4, 4, 4, 0, 0, /*0x0653-0x0657*/
116 0, 0, 0, 0, /*0x0658-0x065B*/
117 1 + 256 * 0x85,/*0x065C*/
118 1 + 256 * 0x87,/*0x065D*/
119 1 + 256 * 0x89,/*0x065E*/
120 1 + 256 * 0x8B,/*0x065F*/
121 0, 0, 0, 0, 0, /*0x0660-0x0664*/
122 0, 0, 0, 0, 0, /*0x0665-0x0669*/
123 0, 0, 0, 0, 0, 0, /*0x066A-0x066F*/
124 4, /*0x0670*/
125 0, /*0x0671*/
126 1 + 32, /*0x0672*/
127 1 + 32, /*0x0673*/
128 0, /*0x0674*/
129 1 + 32, /*0x0675*/
130 1, 1, /*0x0676-0x0677*/
131 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x0678-0x067D*/
132 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x067E-0x0683*/
133 1+2, 1+2, 1+2, 1+2, /*0x0684-0x0687*/
134 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x0688-0x0691*/
135 1, 1, 1, 1, 1, 1, 1, 1, /*0x0692-0x0699*/
136 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/
137 1+2, 1+2, 1+2, 1+2, /*0x069A-0x06A3*/
138 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06A4-0x06AD*/
139 1+2, 1+2, 1+2, 1+2, /*0x06A4-0x06AD*/
140 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/
141 1+2, 1+2, 1+2, 1+2, /*0x06AE-0x06B7*/
142 1+2, 1+2, 1+2, 1+2, 1+2, 1+2, /*0x06B8-0x06BF*/
143 1+2, 1+2, /*0x06B8-0x06BF*/
144 1, /*0x06C0*/
145 1+2, /*0x06C1*/
146 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*0x06C2-0x06CB*/
147 1+2, /*0x06CC*/
148 1, /*0x06CD*/
149 1+2, 1+2, 1+2, 1+2, /*0x06CE-0x06D1*/
150 1, 1 /*0x06D2-0x06D3*/
151 };
152
153 static const UChar presLink[141]=
154 {
155 1 + 2, /*0xFE70*/
156 1 + 2, /*0xFE71*/
157 1 + 2, 0, 1+ 2, 0, 1+ 2, /*0xFE72-0xFE76*/
158 1 + 2, /*0xFE77*/
159 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE78-0xFE81*/
160 1+ 2, 1 + 2, 1+2, 1 + 2, /*0xFE82-0xFE85*/
161 0, 0 + 32, 1 + 32, 0 + 32, /*0xFE86-0xFE89*/
162 1 + 32, 0, 1, 0 + 32, /*0xFE8A-0xFE8D*/
163 1 + 32, 0, 2, 1 + 2, /*0xFE8E-0xFE91*/
164 1, 0 + 32, 1 + 32, 0, /*0xFE92-0xFE95*/
165 2, 1 + 2, 1, 0, /*0xFE96-0xFE99*/
166 1, 0, 2, 1 + 2, /*0xFE9A-0xFE9D*/
167 1, 0, 2, 1 + 2, /*0xFE9E-0xFEA1*/
168 1, 0, 2, 1 + 2, /*0xFEA2-0xFEA5*/
169 1, 0, 2, 1 + 2, /*0xFEA6-0xFEA9*/
170 1, 0, 2, 1 + 2, /*0xFEAA-0xFEAD*/
171 1, 0, 1, 0, /*0xFEAE-0xFEB1*/
172 1, 0, 1, 0, /*0xFEB2-0xFEB5*/
173 1, 0, 2, 1+2, /*0xFEB6-0xFEB9*/
174 1, 0, 2, 1+2, /*0xFEBA-0xFEBD*/
175 1, 0, 2, 1+2, /*0xFEBE-0xFEC1*/
176 1, 0, 2, 1+2, /*0xFEC2-0xFEC5*/
177 1, 0, 2, 1+2, /*0xFEC6-0xFEC9*/
178 1, 0, 2, 1+2, /*0xFECA-0xFECD*/
179 1, 0, 2, 1+2, /*0xFECE-0xFED1*/
180 1, 0, 2, 1+2, /*0xFED2-0xFED5*/
181 1, 0, 2, 1+2, /*0xFED6-0xFED9*/
182 1, 0, 2, 1+2, /*0xFEDA-0xFEDD*/
183 1, 0, 2, 1+2, /*0xFEDE-0xFEE1*/
184 1, 0 + 16, 2 + 16, 1 + 2 +16, /*0xFEE2-0xFEE5*/
185 1 + 16, 0, 2, 1+2, /*0xFEE6-0xFEE9*/
186 1, 0, 2, 1+2, /*0xFEEA-0xFEED*/
187 1, 0, 2, 1+2, /*0xFEEE-0xFEF1*/
188 1, 0, 1, 0, /*0xFEF2-0xFEF5*/
189 1, 0, 2, 1+2, /*0xFEF6-0xFEF9*/
190 1, 0, 1, 0, /*0xFEFA-0xFEFD*/
191 1, 0, 1, 0,
192 1
193 };
194
195 static const UChar convertFEto06[] =
196 {
197 /***********0******1******2******3******4******5******6******7******8******9******A******B******C******D******E******F***/
198 /*FE7*/ 0x64B, 0x64B, 0x64C, 0x64C, 0x64D, 0x64D, 0x64E, 0x64E, 0x64F, 0x64F, 0x650, 0x650, 0x651, 0x651, 0x652, 0x652,
199 /*FE8*/ 0x621, 0x622, 0x622, 0x623, 0x623, 0x624, 0x624, 0x625, 0x625, 0x626, 0x626, 0x626, 0x626, 0x627, 0x627, 0x628,
200 /*FE9*/ 0x628, 0x628, 0x628, 0x629, 0x629, 0x62A, 0x62A, 0x62A, 0x62A, 0x62B, 0x62B, 0x62B, 0x62B, 0x62C, 0x62C, 0x62C,
201 /*FEA*/ 0x62C, 0x62D, 0x62D, 0x62D, 0x62D, 0x62E, 0x62E, 0x62E, 0x62E, 0x62F, 0x62F, 0x630, 0x630, 0x631, 0x631, 0x632,
202 /*FEB*/ 0x632, 0x633, 0x633, 0x633, 0x633, 0x634, 0x634, 0x634, 0x634, 0x635, 0x635, 0x635, 0x635, 0x636, 0x636, 0x636,
203 /*FEC*/ 0x636, 0x637, 0x637, 0x637, 0x637, 0x638, 0x638, 0x638, 0x638, 0x639, 0x639, 0x639, 0x639, 0x63A, 0x63A, 0x63A,
204 /*FED*/ 0x63A, 0x641, 0x641, 0x641, 0x641, 0x642, 0x642, 0x642, 0x642, 0x643, 0x643, 0x643, 0x643, 0x644, 0x644, 0x644,
205 /*FEE*/ 0x644, 0x645, 0x645, 0x645, 0x645, 0x646, 0x646, 0x646, 0x646, 0x647, 0x647, 0x647, 0x647, 0x648, 0x648, 0x649,
206 /*FEF*/ 0x649, 0x64A, 0x64A, 0x64A, 0x64A, 0x65C, 0x65C, 0x65D, 0x65D, 0x65E, 0x65E, 0x65F, 0x65F
207 };
208
209 static const UChar shapeTable[4][4][4]=
210 {
211 { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,1} },
212 { {0,0,2,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} },
213 { {0,0,0,0}, {0,0,0,0}, {0,1,0,3}, {0,1,0,3} },
214 { {0,0,1,2}, {0,0,1,2}, {0,1,1,2}, {0,1,1,3} }
215 };
216
217 /*
218 * This function shapes European digits to Arabic-Indic digits
219 * in-place, writing over the input characters.
220 * Since we know that we are only looking for BMP code points,
221 * we can safely just work with code units (again, at least UTF-16).
222 */
223 static void
224 _shapeToArabicDigitsWithContext(UChar *s, int32_t length,
225 UChar digitBase,
226 UBool isLogical, UBool lastStrongWasAL) {
227 int32_t i;
228 UChar c;
229
230 digitBase-=0x30;
231
232 /* the iteration direction depends on the type of input */
233 if(isLogical) {
234 for(i=0; i<length; ++i) {
235 c=s[i];
236 switch(u_charDirection(c)) {
237 case U_LEFT_TO_RIGHT: /* L */
238 case U_RIGHT_TO_LEFT: /* R */
239 lastStrongWasAL=FALSE;
240 break;
241 case U_RIGHT_TO_LEFT_ARABIC: /* AL */
242 lastStrongWasAL=TRUE;
243 break;
244 case U_EUROPEAN_NUMBER: /* EN */
245 if(lastStrongWasAL && (uint32_t)(c-0x30)<10) {
246 s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase was modified above */
247 }
248 break;
249 default :
250 break;
251 }
252 }
253 } else {
254 for(i=length; i>0; /* pre-decrement in the body */) {
255 c=s[--i];
256 switch(u_charDirection(c)) {
257 case U_LEFT_TO_RIGHT: /* L */
258 case U_RIGHT_TO_LEFT: /* R */
259 lastStrongWasAL=FALSE;
260 break;
261 case U_RIGHT_TO_LEFT_ARABIC: /* AL */
262 lastStrongWasAL=TRUE;
263 break;
264 case U_EUROPEAN_NUMBER: /* EN */
265 if(lastStrongWasAL && (uint32_t)(c-0x30)<10) {
266 s[i]=(UChar)(digitBase+c); /* digitBase+(c-0x30) - digitBase was modified above */
267 }
268 break;
269 default :
270 break;
271 }
272 }
273 }
274 }
275
276 /*
277 *Name : invertBuffer
278 *Function : This function inverts the buffer, it's used
279 * in case the user specifies the buffer to be
280 * U_SHAPE_TEXT_DIRECTION_LOGICAL
281 */
282 static void
283 invertBuffer(UChar *buffer,int32_t size,uint32_t options,int32_t *spacesCountl,int32_t *spacesCountr) {
284
285 UChar temp;
286 int32_t i=0,j=0;
287 int32_t lowlimit = 0, highlimit = 0;
288
289 lowlimit = *spacesCountl;
290 highlimit = *spacesCountr;
291
292 for(i=lowlimit,j=size-highlimit-1;i<j;i++,j--) {
293 temp = buffer[i];
294 buffer[i] = buffer[j];
295 buffer[j] = temp;
296 }
297 }
298
299 /*
300 *Name : changeLamAlef
301 *Function : Converts the Alef characters into an equivalent
302 * LamAlef location in the 0x06xx Range, this is an
303 * intermediate stage in the operation of the program
304 * later it'll be converted into the 0xFExx LamAlefs
305 * in the shaping function.
306 */
307 static UChar
308 changeLamAlef(UChar ch) {
309
310 switch(ch) {
311 case 0x0622 :
312 return(0x065C);
313 break;
314 case 0x0623 :
315 return(0x065D);
316 break;
317 case 0x0625 :
318 return(0x065E);
319 break;
320 case 0x0627 :
321 return(0x065F);
322 break;
323 default :
324 return(0);
325 break;
326 }
327 }
328
329 /*
330 *Name : specialChar
331 *Function : Special Arabic characters need special handling in the shapeUnicode
332 * function, this function returns 1 or 2 for these special characters
333 */
334 static int32_t
335 specialChar(UChar ch) {
336
337 if( (ch>0x0621 && ch<0x0626)||(ch==0x0627)||(ch>0x062e && ch<0x0633)||
338 (ch>0x0647 && ch<0x064a)||(ch==0x0629) ) {
339 return (1);
340 }
341 else
342 if( ch>=0x064B && ch<= 0x0652 )
343 return (2);
344 else
345 if( (ch>=0x0653 && ch<= 0x0655) || ch == 0x0670 ||
346 (ch>=0xFE70 && ch<= 0xFE7F) )
347 return (3);
348 else
349 return (0);
350 }
351
352 /*
353 *Name : getLink
354 *Function : Resolves the link between the characters as
355 * Arabic characters have four forms :
356 * Isolated, Initial, Middle and Final Form
357 */
358 static UChar
359 getLink(UChar ch) {
360
361 if(ch >= 0x0622 && ch <= 0x06D3) {
362 return(araLink[ch-0x0622]);
363 } else if(ch == 0x200D) {
364 return(3);
365 } else if(ch >= 0x206D && ch <= 0x206F) {
366 return(4);
367 } else if(ch >= 0xFE70 && ch <= 0xFEFC) {
368 return(presLink[ch-0xFE70]);
369 } else {
370 return(0);
371 }
372 }
373
374 /*
375 *Name : countSpaces
376 *Function : Counts the number of spaces
377 * at each end of the logical buffer
378 */
379 static void
380 countSpaces(UChar *dest,int32_t size,uint32_t options,int32_t *spacesCountl,int32_t *spacesCountr) {
381
382 int32_t i = 0;
383 int32_t countl = 0,countr = 0;
384
385 while(dest[i] == 0x0020) {
386 countl++;
387 i++;
388 }
389 while(dest[size-1] == 0x0020) {
390 countr++;
391 size--;
392 }
393 *spacesCountl = countl;
394 *spacesCountr = countr;
395 }
396
397 /*
398 *Name : isTashkeelChar
399 *Function : Returns 1 for Tashkeel characters else return 0
400 */
401 static int32_t
402 isTashkeelChar(UChar ch) {
403
404 if( ch>=0x064B && ch<= 0x0652 )
405 return (1);
406 else
407 return (0);
408 }
409
410 /*
411 *Name : isAlefChar
412 *Function : Returns 1 for Alef characters else return 0
413 */
414 static int32_t
415 isAlefChar(UChar ch) {
416
417 if( (ch==0x0622)||(ch==0x0623)||(ch==0x0625)||(ch==0x0627) )
418 return (1);
419 else
420 return (0);
421 }
422
423 /*
424 *Name : isLamAlefChar
425 *Function : Returns 1 for LamAlef characters else return 0
426 */
427 static int32_t
428 isLamAlefChar(UChar ch) {
429
430 if( (ch>=0xFEF5)&&(ch<=0xFEFC) )
431 return (1);
432 else
433 return (0);
434 }
435
436 /*
437 *Name : calculateSize
438 *Function : This function calculates the destSize to be used in preflighting
439 * when the destSize is equal to 0
440 */
441 static int32_t
442 calculateSize(const UChar *source, int32_t sourceLength,
443 int32_t destSize,uint32_t options) {
444
445 int32_t i = 0;
446 destSize = sourceLength;
447
448 switch(options&U_SHAPE_LETTERS_MASK) {
449
450 case U_SHAPE_LETTERS_SHAPE :
451 if((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_VISUAL_LTR) {
452 for(i=0;i<sourceLength;i++) {
453 if( (isAlefChar(source[i]))&&(source[i+1]==0x0644) ) {
454 destSize--;
455 }
456 }
457 }
458 else
459 if((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL) {
460 for(i=0;i<sourceLength;i++) {
461 if( (isAlefChar(source[i+1]))&&(source[i]==0x0644) ) {
462 destSize--;
463 }
464 }
465 }
466 break;
467
468 case U_SHAPE_LETTERS_UNSHAPE :
469 for(i=0;i<sourceLength;i++) {
470 if( isLamAlefChar(source[i]) ) {
471 destSize++;
472 }
473 }
474 break;
475
476 default :
477 /* will never occur because of validity checks at the begin of u_shapeArabic */
478 break;
479 }
480
481 return destSize;
482 }
483
484 /*
485 *Name : removeLamAlefSpaces
486 *Function : The shapeUnicode function converts Lam + Alef into LamAlef + space,
487 * this function removes the spaces behind the LamAlefs according to
488 * the options the user specifies, the spaces are removed to the end
489 * of the buffer, or shrink the buffer ab=nd remove spaces for good
490 * or leave the buffer as it is LamAlef + space.
491 */
492 static int32_t
493 removeLamAlefSpaces(UChar *dest, int32_t sourceLength,
494 int32_t destSize,
495 uint32_t options,
496 UErrorCode *pErrorCode) {
497
498 int32_t i = 0, j = 0;
499 int32_t count = 0;
500 UChar *tempbuffer=NULL;
501
502 switch(options&U_SHAPE_LENGTH_MASK) {
503 case U_SHAPE_LENGTH_GROW_SHRINK :
504 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR);
505 /* Test for NULL */
506 if(tempbuffer == NULL) {
507 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
508 return 0;
509 }
510
511 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR);
512
513 i = j = 0;
514 while(i < sourceLength) {
515 if(dest[i] == 0xFFFF) {
516 j--;
517 count++;
518 }
519 else
520 tempbuffer[j] = dest[i];
521 i++;
522 j++;
523 }
524
525 while(count >= 0) {
526 tempbuffer[i] = 0x0000;
527 i--;
528 count--;
529 }
530
531 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR);
532 destSize = u_strlen(dest);
533 break;
534
535 case U_SHAPE_LENGTH_FIXED_SPACES_NEAR :
536 /* Lam+Alef is already shaped into LamAlef + FFFF */
537 i = 0;
538 while(i < sourceLength) {
539 if(dest[i] == 0xFFFF)
540 dest[i] = 0x0020;
541 i++;
542 }
543 destSize = sourceLength;
544 break;
545
546 case U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING :
547 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR);
548
549 /* Test for NULL */
550 if(tempbuffer == NULL) {
551 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
552 return 0;
553 }
554
555 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR);
556
557 i = j = sourceLength;
558 while(i >= 0) {
559 if(dest[i] == 0xFFFF) {
560 j++;
561 count++;
562 }
563 else
564 tempbuffer[j] = dest[i];
565 i--;
566 j--;
567 }
568 for(i=0;i<count;i++)
569 tempbuffer[i] = 0x0020;
570
571 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR);
572 destSize = sourceLength;
573 break;
574
575 case U_SHAPE_LENGTH_FIXED_SPACES_AT_END :
576 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR);
577
578 /* Test for NULL */
579 if(tempbuffer == NULL) {
580 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
581 return 0;
582 }
583
584 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR);
585
586 i = j = 0;
587 while(i < sourceLength) {
588 if(dest[i] == 0xFFFF) {
589 j--;
590 count++;
591 }
592 else
593 tempbuffer[j] = dest[i];
594 i++;
595 j++;
596 }
597
598 while(count >= 0) {
599 tempbuffer[i] = 0x0020;
600 i--;
601 count--;
602 }
603
604 uprv_memcpy(dest,tempbuffer, sourceLength*U_SIZEOF_UCHAR);
605 destSize = sourceLength;
606 break;
607
608 default :
609 /* will not occur */
610 break;
611 }
612
613 if(tempbuffer)
614 uprv_free(tempbuffer);
615
616 return destSize;
617 }
618
619 /*
620 *Name : expandLamAlef
621 *Function : LamAlef needs special handling as the LamAlef is
622 * one character while expanding it will give two
623 * characters Lam + Alef, so we need to expand the LamAlef
624 * in near or far spaces according to the options the user
625 * specifies or increase the buffer size.
626 * If there are no spaces to expand the LamAlef, an error
627 * will be set to U_NO_SPACE_AVAILABLE as defined in utypes.h
628 */
629 static int32_t
630 expandLamAlef(UChar *dest, int32_t sourceLength,
631 int32_t destSize,uint32_t options,
632 UErrorCode *pErrorCode) {
633
634 int32_t i = 0,j = 0;
635 int32_t countl = 0;
636 int32_t countr = 0;
637 int32_t inpsize = sourceLength;
638 UChar lamalefChar;
639 UChar *tempbuffer=NULL;
640
641 switch(options&U_SHAPE_LENGTH_MASK) {
642
643 case U_SHAPE_LENGTH_GROW_SHRINK :
644 destSize = calculateSize(dest,sourceLength,destSize,options);
645 tempbuffer = (UChar *)uprv_malloc((destSize+1)*U_SIZEOF_UCHAR);
646
647 /* Test for NULL */
648 if(tempbuffer == NULL) {
649 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
650 return 0;
651 }
652
653 uprv_memset(tempbuffer, 0, (destSize+1)*U_SIZEOF_UCHAR);
654
655 i = j = 0;
656 while(i < destSize && j < destSize) {
657 if( isLamAlefChar(dest[i]) ) {
658 tempbuffer[j] = convertLamAlef[ dest[i] - 0xFEF5 ];
659 tempbuffer[j+1] = 0x0644;
660 j++;
661 }
662 else
663 tempbuffer[j] = dest[i];
664 i++;
665 j++;
666 }
667
668 uprv_memcpy(dest, tempbuffer, destSize*U_SIZEOF_UCHAR);
669 break;
670
671 case U_SHAPE_LENGTH_FIXED_SPACES_NEAR :
672 for(i = 0;i<sourceLength;i++) {
673 if((dest[i] == 0x0020) && isLamAlefChar(dest[i+1])) {
674 lamalefChar = dest[i+1];
675 dest[i+1] = 0x0644;
676 dest[i] = convertLamAlef[ lamalefChar - 0xFEF5 ];
677 }
678 else
679 if((dest[i] != 0x0020) && isLamAlefChar(dest[i+1])) {
680 *pErrorCode=U_NO_SPACE_AVAILABLE;
681 }
682 }
683 destSize = sourceLength;
684 break;
685
686 case U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING :
687 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR);
688
689 /* Test for NULL */
690 if(tempbuffer == NULL) {
691 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
692 return 0;
693 }
694
695 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR);
696
697 i = 0;
698 while(dest[i] == 0x0020) {
699 countl++;
700 i++;
701 }
702
703 i = j = sourceLength-1;
704 while(i >= 0 && j >= 0) {
705 if( countl>0 && isLamAlefChar(dest[i]) ) {
706 tempbuffer[j] = 0x0644;
707 tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ];
708 j--;
709 countl--;
710 }
711 else {
712 if( countl == 0 && isLamAlefChar(dest[i]) )
713 *pErrorCode=U_NO_SPACE_AVAILABLE;
714 tempbuffer[j] = dest[i];
715 }
716 i--;
717 j--;
718 }
719
720 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR);
721 destSize = sourceLength;
722 break;
723
724 case U_SHAPE_LENGTH_FIXED_SPACES_AT_END :
725 /* LamAlef expansion below is done from right to left to make sure that we consume
726 * the spaces with the LamAlefs as they appear in the visual buffer from right to left
727 */
728 tempbuffer = (UChar *)uprv_malloc((sourceLength+1)*U_SIZEOF_UCHAR);
729
730 /* Test for NULL */
731 if(tempbuffer == NULL) {
732 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
733 return 0;
734 }
735
736 uprv_memset(tempbuffer, 0, (sourceLength+1)*U_SIZEOF_UCHAR);
737
738 while(dest[inpsize-1] == 0x0020) {
739 countr++;
740 inpsize--;
741 }
742
743 i = sourceLength - countr - 1;
744 j = sourceLength - 1;
745
746 while(i >= 0 && j >= 0) {
747 if( countr>0 && isLamAlefChar(dest[i]) ) {
748 tempbuffer[j] = 0x0644;
749 tempbuffer[j-1] = convertLamAlef[ dest[i] - 0xFEF5 ];
750 j--;
751 countr--;
752 }
753 else {
754 if( countr == 0 && isLamAlefChar(dest[i]) )
755 *pErrorCode=U_NO_SPACE_AVAILABLE;
756 tempbuffer[j] = dest[i];
757 }
758 i--;
759 j--;
760 }
761
762 if(countr > 0) {
763 uprv_memcpy(tempbuffer, tempbuffer+countr, sourceLength*U_SIZEOF_UCHAR);
764 if(u_strlen(tempbuffer) < sourceLength) {
765 for(i=sourceLength-1;i>=sourceLength-countr;i--)
766 tempbuffer[i] = 0x0020;
767 }
768 }
769
770 uprv_memcpy(dest, tempbuffer, sourceLength*U_SIZEOF_UCHAR);
771
772 destSize = sourceLength;
773 break;
774
775 default :
776 /* will never occur because of validity checks */
777 break;
778 }
779
780 if(tempbuffer)
781 uprv_free(tempbuffer);
782
783 return destSize;
784 }
785
786 /*
787 *Name : shapeUnicode
788 *Function : Converts an Arabic Unicode buffer in 06xx Range into a shaped
789 * arabic Unicode buffer in FExx Range
790 */
791 static int32_t
792 shapeUnicode(UChar *dest, int32_t sourceLength,
793 int32_t destSize,uint32_t options,
794 UErrorCode *pErrorCode,
795 int tashkeelFlag) {
796
797 int32_t i, iend;
798 int32_t step;
799 int32_t prevPos, lastPos,Nx, Nw;
800 unsigned int Shape;
801 int32_t flag;
802 int32_t lamalef_found = 0;
803 UChar prevLink = 0, lastLink = 0, currLink, nextLink = 0;
804 UChar wLamalef;
805
806 /*
807 * Converts the input buffer from FExx Range into 06xx Range
808 * to make sure that all characters are in the 06xx range
809 * even the lamalef is converted to the special region in
810 * the 06xx range
811 */
812 for (i = 0; i < sourceLength; i++) {
813 UChar inputChar = dest[i];
814 if ( (inputChar >= 0xFE70) && (inputChar <= 0xFEFC)) {
815 dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ;
816 } else {
817 dest[i] = inputChar ;
818 }
819 }
820
821 /* sets the index to the end of the buffer, together with the step point to -1 */
822 i = sourceLength - 1;
823 iend = -1;
824 step = -1;
825
826 /*
827 * This function resolves the link between the characters .
828 * Arabic characters have four forms :
829 * Isolated Form, Initial Form, Middle Form and Final Form
830 */
831 currLink = getLink(dest[i]);
832
833 prevPos = i;
834 lastPos = i;
835 Nx = -2, Nw = 0;
836
837 while (i != iend) {
838 /* If high byte of currLink > 0 then more than one shape */
839 if ((currLink & 0xFF00) > 0 || isTashkeelChar(dest[i])) {
840 Nw = i + step;
841 while (Nx < 0) { /* we need to know about next char */
842 if(Nw == iend) {
843 nextLink = 0;
844 Nx = 3000;
845 } else {
846 nextLink = getLink(dest[Nw]);
847 if((nextLink & IRRELEVANT) == 0) {
848 Nx = Nw;
849 } else {
850 Nw = Nw + step;
851 }
852 }
853 }
854
855 if ( ((currLink & ALEFTYPE) > 0) && ((lastLink & LAMTYPE) > 0) ) {
856 lamalef_found = 1;
857 wLamalef = changeLamAlef(dest[i]); /*get from 0x065C-0x065f */
858 if ( wLamalef != 0) {
859 dest[i] = 0xFFFF; /* The default case is to drop the Alef and replace */
860 dest[lastPos] =wLamalef; /* it by 0xFFFF which is the last character in the */
861 i=lastPos; /* unicode private use area, this is done to make */
862 } /* sure that removeLamAlefSpaces() handles only the */
863 lastLink = prevLink; /* spaces generated during lamalef generation. */
864 currLink = getLink(wLamalef); /* 0xFFFF is added here and is replaced by spaces */
865 } /* in removeLamAlefSpaces() */
866 /*
867 * get the proper shape according to link ability of neighbors
868 * and of character; depends on the order of the shapes
869 * (isolated, initial, middle, final) in the compatibility area
870 */
871 flag = specialChar(dest[i]);
872
873 Shape = shapeTable[nextLink & (LINKR + LINKL)]
874 [lastLink & (LINKR + LINKL)]
875 [currLink & (LINKR + LINKL)];
876
877 if (flag == 1) {
878 Shape = (Shape == 1 || Shape == 3) ? 1 : 0;
879 }
880 else
881 if(flag == 2) {
882 if( (lastLink & LINKL) && (nextLink & LINKR) && (tashkeelFlag == 1) &&
883 dest[i] != 0x064C && dest[i] != 0x064D ) {
884 Shape = 1;
885 if( (nextLink&ALEFTYPE) == ALEFTYPE && (lastLink&LAMTYPE) == LAMTYPE )
886 Shape = 0;
887 }
888 else {
889 Shape = 0;
890 }
891 }
892
893 if(flag == 2) {
894 dest[i] = 0xFE70 + IrrelevantPos[(dest[i] - 0x064B)] + Shape;
895 }
896 else
897 dest[i] = (UChar)(0xFE70 + (currLink >> 8) + Shape);
898 }
899
900 /* move one notch forward */
901 if ((currLink & IRRELEVANT) == 0) {
902 prevLink = lastLink;
903 lastLink = currLink;
904 prevPos = lastPos;
905 lastPos = i;
906 }
907
908 i = i + step;
909 if (i == Nx) {
910 currLink = nextLink;
911 Nx = -2;
912 }
913 else if(i != iend) {
914 currLink = getLink(dest[i]);
915 }
916 }
917
918 /* If there is lamalef in the buffer call expandLamAlef */
919 if(lamalef_found != 0)
920 destSize = removeLamAlefSpaces(dest,sourceLength,destSize,options,pErrorCode);
921 else
922 destSize = sourceLength;
923
924 return destSize;
925 }
926
927 /*
928 *Name : deShapeUnicode
929 *Function : Converts an Arabic Unicode buffer in FExx Range into unshaped
930 * arabic Unicode buffer in 06xx Range
931 */
932 static int32_t
933 deShapeUnicode(UChar *dest, int32_t sourceLength,
934 int32_t destSize,uint32_t options,
935 UErrorCode *pErrorCode) {
936 int32_t i = 0;
937 int32_t lamalef_found = 0;
938
939 /*
940 *This for loop changes the buffer from the Unicode FE range to
941 *the Unicode 06 range
942 */
943 for(i = 0; i < sourceLength; i++) {
944 UChar inputChar = dest[i];
945 if (( inputChar >= 0xFE70) && (inputChar <= 0xFEF4 )) { /* FExx Arabic range */
946 dest[i] = convertFEto06 [ (inputChar - 0xFE70) ] ;
947 } else {
948 dest[i] = inputChar ;
949 }
950 if( isLamAlefChar(dest[i]) )
951 lamalef_found = 1;
952 }
953
954 /* If there is lamalef in the buffer call expandLamAlef */
955 if(lamalef_found != 0)
956 destSize = expandLamAlef(dest,sourceLength,destSize,options,pErrorCode);
957 else
958 destSize = sourceLength;
959
960 return destSize;
961 }
962
963 U_CAPI int32_t U_EXPORT2
964 u_shapeArabic(const UChar *source, int32_t sourceLength,
965 UChar *dest, int32_t destCapacity,
966 uint32_t options,
967 UErrorCode *pErrorCode) {
968
969 int32_t destLength;
970
971 /* usual error checking */
972 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
973 return 0;
974 }
975
976 /* make sure that no reserved options values are used; allow dest==NULL only for preflighting */
977 if( source==NULL || sourceLength<-1 ||
978 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
979 options>=U_SHAPE_DIGIT_TYPE_RESERVED ||
980 (options&U_SHAPE_DIGITS_MASK)>=U_SHAPE_DIGITS_RESERVED
981 ) {
982 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
983 return 0;
984 }
985
986 /* determine the source length */
987 if(sourceLength==-1) {
988 sourceLength=u_strlen(source);
989 }
990 if(sourceLength==0) {
991 return u_terminateUChars(dest, destCapacity, 0, pErrorCode);
992 }
993
994 /* check that source and destination do not overlap */
995 if( dest!=NULL &&
996 ((source<=dest && dest<source+sourceLength) ||
997 (dest<=source && source<dest+destCapacity))
998 ) {
999 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
1000 return 0;
1001 }
1002
1003 if((options&U_SHAPE_LETTERS_MASK)!=U_SHAPE_LETTERS_NOOP) {
1004 UChar buffer[300];
1005 UChar *tempbuffer;
1006 int32_t outputSize, spacesCountl=0, spacesCountr=0;
1007
1008 /* calculate destination size */
1009 /* TODO: do we ever need to do this pure preflighting? */
1010 if((options&U_SHAPE_LENGTH_MASK)==U_SHAPE_LENGTH_GROW_SHRINK) {
1011 outputSize=calculateSize(source,sourceLength,destCapacity,options);
1012 } else {
1013 outputSize=sourceLength;
1014 }
1015 if(outputSize>destCapacity) {
1016 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1017 return outputSize;
1018 }
1019
1020 /*
1021 * need a temporary buffer of size max(outputSize, sourceLength)
1022 * because at first we copy source->temp
1023 */
1024 if(sourceLength>outputSize) {
1025 outputSize=sourceLength;
1026 }
1027
1028 /* Start of Arabic letter shaping part */
1029 if(outputSize<=sizeof(buffer)/U_SIZEOF_UCHAR) {
1030 outputSize=sizeof(buffer)/U_SIZEOF_UCHAR;
1031 tempbuffer=buffer;
1032 } else {
1033 tempbuffer = (UChar *)uprv_malloc(outputSize*U_SIZEOF_UCHAR);
1034
1035 /*Test for NULL*/
1036 if(tempbuffer == NULL) {
1037 *pErrorCode = U_MEMORY_ALLOCATION_ERROR;
1038 return 0;
1039 }
1040 }
1041 uprv_memcpy(tempbuffer, source, sourceLength*U_SIZEOF_UCHAR);
1042 if(sourceLength<outputSize) {
1043 uprv_memset(tempbuffer+sourceLength, 0, (outputSize-sourceLength)*U_SIZEOF_UCHAR);
1044 }
1045
1046 if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL) {
1047 countSpaces(tempbuffer,sourceLength,options,&spacesCountl,&spacesCountr);
1048 invertBuffer(tempbuffer,sourceLength,options,&spacesCountl,&spacesCountr);
1049 }
1050
1051 switch(options&U_SHAPE_LETTERS_MASK) {
1052 case U_SHAPE_LETTERS_SHAPE :
1053 /* Call the shaping function with tashkeel flag == 1 */
1054 destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,1);
1055 break;
1056 case U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED :
1057 /* Call the shaping function with tashkeel flag == 0 */
1058 destLength = shapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode,0);
1059 break;
1060 case U_SHAPE_LETTERS_UNSHAPE :
1061 /* Call the deshaping function */
1062 destLength = deShapeUnicode(tempbuffer,sourceLength,destCapacity,options,pErrorCode);
1063 break;
1064 default :
1065 /* will never occur because of validity checks above */
1066 destLength = 0;
1067 break;
1068 }
1069
1070 /*
1071 * TODO: (markus 2002aug01)
1072 * For as long as we always preflight the outputSize above
1073 * we should U_ASSERT(outputSize==destLength)
1074 * except for the adjustment above before the tempbuffer allocation
1075 */
1076
1077 if((options&U_SHAPE_TEXT_DIRECTION_MASK) == U_SHAPE_TEXT_DIRECTION_LOGICAL) {
1078 countSpaces(tempbuffer,destLength,options,&spacesCountl,&spacesCountr);
1079 invertBuffer(tempbuffer,destLength,options,&spacesCountl,&spacesCountr);
1080 }
1081 uprv_memcpy(dest, tempbuffer, uprv_min(destLength, destCapacity)*U_SIZEOF_UCHAR);
1082
1083 if(tempbuffer!=buffer) {
1084 uprv_free(tempbuffer);
1085 }
1086
1087 if(destLength>destCapacity) {
1088 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1089 return destLength;
1090 }
1091
1092 /* End of Arabic letter shaping part */
1093 } else {
1094 /*
1095 * No letter shaping:
1096 * just make sure the destination is large enough and copy the string.
1097 */
1098 if(destCapacity<sourceLength) {
1099 /* this catches preflighting, too */
1100 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
1101 return sourceLength;
1102 }
1103 uprv_memcpy(dest, source, sourceLength*U_SIZEOF_UCHAR);
1104 destLength=sourceLength;
1105 }
1106
1107 /*
1108 * Perform number shaping.
1109 * With UTF-16 or UTF-32, the length of the string is constant.
1110 * The easiest way to do this is to operate on the destination and
1111 * "shape" the digits in-place.
1112 */
1113 if((options&U_SHAPE_DIGITS_MASK)!=U_SHAPE_DIGITS_NOOP) {
1114 UChar digitBase;
1115 int32_t i;
1116
1117 /* select the requested digit group */
1118 switch(options&U_SHAPE_DIGIT_TYPE_MASK) {
1119 case U_SHAPE_DIGIT_TYPE_AN:
1120 digitBase=0x660; /* Unicode: "Arabic-Indic digits" */
1121 break;
1122 case U_SHAPE_DIGIT_TYPE_AN_EXTENDED:
1123 digitBase=0x6f0; /* Unicode: "Eastern Arabic-Indic digits (Persian and Urdu)" */
1124 break;
1125 default:
1126 /* will never occur because of validity checks above */
1127 digitBase=0;
1128 break;
1129 }
1130
1131 /* perform the requested operation */
1132 switch(options&U_SHAPE_DIGITS_MASK) {
1133 case U_SHAPE_DIGITS_EN2AN:
1134 /* add (digitBase-'0') to each European (ASCII) digit code point */
1135 digitBase-=0x30;
1136 for(i=0; i<destLength; ++i) {
1137 if(((uint32_t)dest[i]-0x30)<10) {
1138 dest[i]+=digitBase;
1139 }
1140 }
1141 break;
1142 case U_SHAPE_DIGITS_AN2EN:
1143 /* subtract (digitBase-'0') from each Arabic digit code point */
1144 for(i=0; i<destLength; ++i) {
1145 if(((uint32_t)dest[i]-(uint32_t)digitBase)<10) {
1146 dest[i]-=digitBase-0x30;
1147 }
1148 }
1149 break;
1150 case U_SHAPE_DIGITS_ALEN2AN_INIT_LR:
1151 _shapeToArabicDigitsWithContext(dest, destLength,
1152 digitBase,
1153 (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL),
1154 FALSE);
1155 break;
1156 case U_SHAPE_DIGITS_ALEN2AN_INIT_AL:
1157 _shapeToArabicDigitsWithContext(dest, destLength,
1158 digitBase,
1159 (UBool)((options&U_SHAPE_TEXT_DIRECTION_MASK)==U_SHAPE_TEXT_DIRECTION_LOGICAL),
1160 TRUE);
1161 break;
1162 default:
1163 /* will never occur because of validity checks above */
1164 break;
1165 }
1166 }
1167
1168 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
1169 }