git.saurik.com Git - apple/icu.git/blob - icuSources/common/uts46.cpp

2 // License & terms of use: http://www.unicode.org/copyright.html

3 /*

4 *******************************************************************************

7 *******************************************************************************

8 * file name: uts46.cpp

9 * encoding: UTF-8

10 * tab size: 8 (not used)

11 * indentation:4

12 *

13 * created on: 2010mar09

14 * created by: Markus W. Scherer

15 */

17 #include "unicode/utypes.h"

19 #if !UCONFIG_NO_IDNA

21 #include "unicode/idna.h"

22 #include "unicode/normalizer2.h"

23 #include "unicode/uscript.h"

24 #include "unicode/ustring.h"

25 #include "unicode/utf16.h"

26 #include "cmemory.h"

27 #include "cstring.h"

28 #include "punycode.h"

29 #include "ubidi_props.h"

30 #include "ustr_imp.h"

32 // Note about tests for UIDNA_ERROR_DOMAIN_NAME_TOO_LONG:

33 //

34 // The domain name length limit is 255 octets in an internal DNS representation

35 // where the last ("root") label is the empty label

36 // represented by length byte 0 alone.

37 // In a conventional string, this translates to 253 characters, or 254

38 // if there is a trailing dot for the root label.

40 U_NAMESPACE_BEGIN

42 // Severe errors which usually result in a U+FFFD replacement character in the result string.

43 const uint32_t severeErrors=

44 UIDNA_ERROR_LEADING_COMBINING_MARK|

45 UIDNA_ERROR_DISALLOWED|

46 UIDNA_ERROR_PUNYCODE|

47 UIDNA_ERROR_LABEL_HAS_DOT|

48 UIDNA_ERROR_INVALID_ACE_LABEL;

50 static inline UBool

 isASCIIString(const UnicodeString &dest) {

     const UChar *s=dest.getBuffer();

     const UChar *limit=s+dest.length();

54 while(s<limit) {

         if(*s++>0x7f) {

56 return FALSE;

57 }

58 }

59 return TRUE;

60 }

62 static UBool

 isASCIIOkBiDi(const UChar *s, int32_t length);

65 static UBool

 isASCIIOkBiDi(const char *s, int32_t length);

68 // IDNA class default implementations -------------------------------------- ***

70 IDNA::~IDNA() {}

72 void

 IDNA::labelToASCII_UTF8(StringPiece label, ByteSink &dest,

                         IDNAInfo &info, UErrorCode &errorCode) const {

     if(U_SUCCESS(errorCode)) {

76 UnicodeString destString;

         labelToASCII(UnicodeString::fromUTF8(label), destString,

                      info, errorCode).toUTF8(dest);

79 }

80 }

82 void

 IDNA::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,

                          IDNAInfo &info, UErrorCode &errorCode) const {

     if(U_SUCCESS(errorCode)) {

86 UnicodeString destString;

         labelToUnicode(UnicodeString::fromUTF8(label), destString,

                        info, errorCode).toUTF8(dest);

89 }

90 }

92 void

 IDNA::nameToASCII_UTF8(StringPiece name, ByteSink &dest,

                        IDNAInfo &info, UErrorCode &errorCode) const {

     if(U_SUCCESS(errorCode)) {

96 UnicodeString destString;

         nameToASCII(UnicodeString::fromUTF8(name), destString,

                     info, errorCode).toUTF8(dest);

99 }

100 }

101

102 void

 IDNA::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,

                         IDNAInfo &info, UErrorCode &errorCode) const {

     if(U_SUCCESS(errorCode)) {

106 UnicodeString destString;

         nameToUnicode(UnicodeString::fromUTF8(name), destString,

                       info, errorCode).toUTF8(dest);

109 }

110 }

111

112 // UTS46 class declaration ------------------------------------------------- ***

113

114 class UTS46 : public IDNA {

115 public:

     UTS46(uint32_t options, UErrorCode &errorCode);

117 virtual ~UTS46();

118

119 virtual UnicodeString &

     labelToASCII(const UnicodeString &label, UnicodeString &dest,

                  IDNAInfo &info, UErrorCode &errorCode) const;

122

123 virtual UnicodeString &

     labelToUnicode(const UnicodeString &label, UnicodeString &dest,

                    IDNAInfo &info, UErrorCode &errorCode) const;

126

127 virtual UnicodeString &

     nameToASCII(const UnicodeString &name, UnicodeString &dest,

                 IDNAInfo &info, UErrorCode &errorCode) const;

130

131 virtual UnicodeString &

     nameToUnicode(const UnicodeString &name, UnicodeString &dest,

                   IDNAInfo &info, UErrorCode &errorCode) const;

134

135 virtual void

     labelToASCII_UTF8(StringPiece label, ByteSink &dest,

                       IDNAInfo &info, UErrorCode &errorCode) const;

138

139 virtual void

     labelToUnicodeUTF8(StringPiece label, ByteSink &dest,

                        IDNAInfo &info, UErrorCode &errorCode) const;

142

143 virtual void

     nameToASCII_UTF8(StringPiece name, ByteSink &dest,

                      IDNAInfo &info, UErrorCode &errorCode) const;

146

147 virtual void

     nameToUnicodeUTF8(StringPiece name, ByteSink &dest,

                       IDNAInfo &info, UErrorCode &errorCode) const;

150

151 private:

152 UnicodeString &

     process(const UnicodeString &src,

154 UBool isLabel, UBool toASCII,

155 UnicodeString &dest,

             IDNAInfo &info, UErrorCode &errorCode) const;

157

158 void

159 processUTF8(StringPiece src,

160 UBool isLabel, UBool toASCII,

161 ByteSink &dest,

                 IDNAInfo &info, UErrorCode &errorCode) const;

163

164 UnicodeString &

     processUnicode(const UnicodeString &src,

166 int32_t labelStart, int32_t mappingStart,

167 UBool isLabel, UBool toASCII,

168 UnicodeString &dest,

                    IDNAInfo &info, UErrorCode &errorCode) const;

170

171 // returns the new dest.length()

172 int32_t

     mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,

174 UErrorCode &errorCode) const;

175

176 // returns the new label length

177 int32_t

178 processLabel(UnicodeString &dest,

179 int32_t labelStart, int32_t labelLength,

180 UBool toASCII,

                  IDNAInfo &info, UErrorCode &errorCode) const;

182 int32_t

183 markBadACELabel(UnicodeString &dest,

184 int32_t labelStart, int32_t labelLength,

                     UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const;

186

187 void

     checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const;

189

190 UBool

     isLabelOkContextJ(const UChar *label, int32_t labelLength) const;

192

193 void

     checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const;

195

196 const Normalizer2 &uts46Norm2; // uts46.nrm

197 uint32_t options;

198 };

199

200 IDNA *

 IDNA::createUTS46Instance(uint32_t options, UErrorCode &errorCode) {

     if(U_SUCCESS(errorCode)) {

         IDNA *idna=new UTS46(options, errorCode);

204 if(idna==NULL) {

205 errorCode=U_MEMORY_ALLOCATION_ERROR;

         } else if(U_FAILURE(errorCode)) {

207 delete idna;

208 idna=NULL;

209 }

210 return idna;

211 } else {

212 return NULL;

213 }

214 }

215

216 // UTS46 implementation ---------------------------------------------------- ***

217

 UTS46::UTS46(uint32_t opt, UErrorCode &errorCode)

         : uts46Norm2(*Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, errorCode)),

220 options(opt) {}

221

222 UTS46::~UTS46() {}

223

224 UnicodeString &

 UTS46::labelToASCII(const UnicodeString &label, UnicodeString &dest,

                     IDNAInfo &info, UErrorCode &errorCode) const {

     return process(label, TRUE, TRUE, dest, info, errorCode);

228 }

229

230 UnicodeString &

 UTS46::labelToUnicode(const UnicodeString &label, UnicodeString &dest,

                       IDNAInfo &info, UErrorCode &errorCode) const {

     return process(label, TRUE, FALSE, dest, info, errorCode);

234 }

235

236 UnicodeString &

 UTS46::nameToASCII(const UnicodeString &name, UnicodeString &dest,

                    IDNAInfo &info, UErrorCode &errorCode) const {

     process(name, FALSE, TRUE, dest, info, errorCode);

     if( dest.length()>=254 && (info.errors&UIDNA_ERROR_DOMAIN_NAME_TOO_LONG)==0 &&

241 isASCIIString(dest) &&

         (dest.length()>254 || dest[253]!=0x2e)

243 ) {

244 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;

245 }

246 return dest;

247 }

248

249 UnicodeString &

 UTS46::nameToUnicode(const UnicodeString &name, UnicodeString &dest,

                      IDNAInfo &info, UErrorCode &errorCode) const {

     return process(name, FALSE, FALSE, dest, info, errorCode);

253 }

254

255 void

 UTS46::labelToASCII_UTF8(StringPiece label, ByteSink &dest,

                          IDNAInfo &info, UErrorCode &errorCode) const {

     processUTF8(label, TRUE, TRUE, dest, info, errorCode);

259 }

260

261 void

 UTS46::labelToUnicodeUTF8(StringPiece label, ByteSink &dest,

                           IDNAInfo &info, UErrorCode &errorCode) const {

     processUTF8(label, TRUE, FALSE, dest, info, errorCode);

265 }

266

267 void

 UTS46::nameToASCII_UTF8(StringPiece name, ByteSink &dest,

                         IDNAInfo &info, UErrorCode &errorCode) const {

     processUTF8(name, FALSE, TRUE, dest, info, errorCode);

271 }

272

273 void

 UTS46::nameToUnicodeUTF8(StringPiece name, ByteSink &dest,

                          IDNAInfo &info, UErrorCode &errorCode) const {

     processUTF8(name, FALSE, FALSE, dest, info, errorCode);

277 }

278

// UTS #46 data for ASCII characters, indexed by code point (0..0x7f).
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
// and passes through all other ASCII characters.
// If UIDNA_USE_STD3_RULES is set, then non-LDH characters are disallowed
// using this data.
// The ASCII fastpath also uses this data.
// Values: -1=disallowed  0=valid  1=mapped (lowercase)
static const int8_t asciiData[128]={
    // 0000..000F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0010..001F
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0020..002F
    // 002D..002E; valid  # HYPHEN-MINUS..FULL STOP
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0,  0, -1,
    // 0030..003F
    // 0030..0039; valid  # DIGIT ZERO..DIGIT NINE
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1,
    // 0040..005F
    // 0041..005A; mapped  # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
    -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1,
    // 0060..007F
    // 0061..007A; valid  # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
    -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1
};

300

// Common UTF-16 implementation behind labelToASCII(), labelToUnicode(),
// nameToASCII() and nameToUnicode().
//
// Validates the arguments, resets dest and info, and then runs an ASCII
// fastpath that copies src into dest (lowercasing A-Z) while recording
// hyphen, empty-label and length errors. If the whole input is handled by
// the fastpath the function returns directly; otherwise the remainder is
// handed to processUnicode() starting at the index where the fastpath stopped.
//
// src        input string; must not be the same object as dest
// isLabel    TRUE to treat src as a single label (a dot ends the fastpath)
// toASCII    TRUE to apply the length checks of the ASCII form
// dest       output string; set to bogus on argument/entry errors
// info       receives the error bits and BiDi state
// errorCode  ICU error code; a failure on entry makes this a no-op
// Returns dest.
301 UnicodeString &
 UTS46::process(const UnicodeString &src,
303 UBool isLabel, UBool toASCII,
304 UnicodeString &dest,
                IDNAInfo &info, UErrorCode &errorCode) const {
306 // uts46Norm2.normalize() would do all of this error checking and setup,
307 // but with the ASCII fastpath we do not always call it, and do not
308 // call it first.
     if(U_FAILURE(errorCode)) {
310 dest.setToBogus();
311 return dest;
312 }
     const UChar *srcArray=src.getBuffer();
     // Reject aliasing of src and dest, and a src without a readable buffer.
     if(&dest==&src || srcArray==NULL) {
315 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
316 dest.setToBogus();
317 return dest;
318 }
319 // Arguments are fine, reset output values.
320 dest.remove();
321 info.reset();
     int32_t srcLength=src.length();
     if(srcLength==0) {
324 info.errors|=UIDNA_ERROR_EMPTY_LABEL;
325 return dest;
326 }
     // Writable buffer with room for at least srcLength units: the fastpath
     // writes destArray[i] only for i<srcLength.
     UChar *destArray=dest.getBuffer(srcLength);
328 if(destArray==NULL) {
329 errorCode=U_MEMORY_ALLOCATION_ERROR;
330 return dest;
331 }
332 // ASCII fastpath
     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
     // labelStart: index of the first character of the label currently scanned.
334 int32_t labelStart=0;
335 int32_t i;
     for(i=0;; ++i) {
         // End of input reached: everything was handled by the fastpath.
337 if(i==srcLength) {
338 if(toASCII) {
                 if((i-labelStart)>63) {
340 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
341 }
342 // There is a trailing dot if labelStart==i.
                 if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
344 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
345 }
346 }
347 info.errors|=info.labelErrors;
348 dest.releaseBuffer(i);
349 return dest;
350 }
351 UChar c=srcArray[i];
         // Non-ASCII: leave the fastpath; c has not been copied to dest.
         if(c>0x7f) {
353 break;
354 }
355 int cData=asciiData[c];
         if(cData>0) {
             destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
         } else if(cData<0 && disallowNonLDHDot) {
359 break; // Replacing with U+FFFD can be complicated for toASCII.
360 } else {
361 destArray[i]=c;
             if(c==0x2d) {  // hyphen
                 // Hyphens in the third and fourth positions of the label.
                 if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
364 // "??--..." is Punycode or forbidden.
365 ++i; // '-' was copied to dest already
366 break;
367 }
368 if(i==labelStart) {
369 // label starts with "-"
370 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
371 }
                 if((i+1)==srcLength || srcArray[i+1]==0x2e) {
373 // label ends with "-"
374 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
375 }
             } else if(c==0x2e) {  // dot
377 if(isLabel) {
378 // Replacing with U+FFFD can be complicated for toASCII.
379 ++i; // '.' was copied to dest already
380 break;
381 }
382 if(i==labelStart) {
383 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
384 }
                 if(toASCII && (i-labelStart)>63) {
386 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
387 }
                 // The label is complete: fold its errors into the overall
                 // set and start the next label after the dot.
388 info.errors|=info.labelErrors;
389 info.labelErrors=0;
390 labelStart=i+1;
391 }
392 }
393 }
     // The fastpath stopped early: dest holds the first i units; the rest of
     // src (from index i) is processed by processUnicode().
394 info.errors|=info.labelErrors;
395 dest.releaseBuffer(i);
     processUnicode(src, labelStart, i, isLabel, toASCII, dest, info, errorCode);
     // BiDi error if the processed part is not ok, or if the labels before
     // labelStart (handled only by the fastpath) fail the ASCII BiDi check.
     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(dest.getBuffer(), labelStart)))
399 ) {
400 info.errors|=UIDNA_ERROR_BIDI;
401 }
402 return dest;
403 }

404

// Common UTF-8 implementation behind the *_UTF8 entry points.
//
// For inputs of at most 256 bytes an ASCII fastpath copies src into an
// append buffer obtained from dest (lowercasing A-Z) while recording hyphen,
// empty-label and length errors. If the fastpath cannot handle a byte, the
// already completed labels are appended to dest as-is and the current label
// plus the rest of src are converted to UTF-16 and passed to processUnicode().
// Longer inputs are converted to UTF-16 and passed to processUnicode() whole.
//
// src        UTF-8 input; NULL data is only accepted with length 0
// isLabel    TRUE to treat src as a single label
// toASCII    TRUE to apply the length checks of the ASCII form
// dest       sink that receives the UTF-8 result
// info       receives the error bits and BiDi state
// errorCode  ICU error code; a failure on entry makes this a no-op
405 void
 UTS46::processUTF8(StringPiece src,
407 UBool isLabel, UBool toASCII,
408 ByteSink &dest,
                    IDNAInfo &info, UErrorCode &errorCode) const {
     if(U_FAILURE(errorCode)) {
411 return;
412 }
     const char *srcArray=src.data();
     int32_t srcLength=src.length();
     if(srcArray==NULL && srcLength!=0) {
416 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
417 return;
418 }
419 // Arguments are fine, reset output values.
420 info.reset();
     if(srcLength==0) {
422 info.errors|=UIDNA_ERROR_EMPTY_LABEL;
423 dest.Flush();
424 return;
425 }
     // destString collects the UTF-16 output of processUnicode();
     // labelStart counts the bytes already emitted by the fastpath.
426 UnicodeString destString;
427 int32_t labelStart=0;
     if(srcLength<=256) {  // length of stackArray[]
429 // ASCII fastpath
430 char stackArray[256];
431 int32_t destCapacity;
         // stackArray serves as the scratch buffer passed to GetAppendBuffer().
         char *destArray=dest.GetAppendBuffer(srcLength, srcLength+20,
                                              stackArray, UPRV_LENGTHOF(stackArray), &destCapacity);
         UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
435 int32_t i;
         for(i=0;; ++i) {
             // End of input reached: everything was handled by the fastpath.
437 if(i==srcLength) {
438 if(toASCII) {
                     if((i-labelStart)>63) {
440 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
441 }
442 // There is a trailing dot if labelStart==i.
                     if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
444 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
445 }
446 }
447 info.errors|=info.labelErrors;
                 dest.Append(destArray, i);
449 dest.Flush();
450 return;
451 }
452 char c=srcArray[i];
             if((int8_t)c<0) {  // (uint8_t)c>0x7f
454 break;
455 }
             int cData=asciiData[(int)c];  // Cast: gcc warns about indexing with a char.
             if(cData>0) {
                 destArray[i]=c+0x20;  // Lowercase an uppercase ASCII letter.
             } else if(cData<0 && disallowNonLDHDot) {
460 break; // Replacing with U+FFFD can be complicated for toASCII.
461 } else {
462 destArray[i]=c;
                 if(c==0x2d) {  // hyphen
                     // Unlike the UTF-16 fastpath, i is not advanced before
                     // the break here; the byte at i is taken from src again
                     // by the processUnicode() call below.
                     if(i==(labelStart+3) && srcArray[i-1]==0x2d) {
465 // "??--..." is Punycode or forbidden.
466 break;
467 }
468 if(i==labelStart) {
469 // label starts with "-"
470 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
471 }
                     if((i+1)==srcLength || srcArray[i+1]==0x2e) {
473 // label ends with "-"
474 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
475 }
                 } else if(c==0x2e) {  // dot
477 if(isLabel) {
478 break; // Replacing with U+FFFD can be complicated for toASCII.
479 }
480 if(i==labelStart) {
481 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
482 }
                     if(toASCII && (i-labelStart)>63) {
484 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
485 }
                     // The label is complete: fold its errors into the overall
                     // set and start the next label after the dot.
486 info.errors|=info.labelErrors;
487 info.labelErrors=0;
488 labelStart=i+1;
489 }
490 }
491 }
492 info.errors|=info.labelErrors;
493 // Convert the processed ASCII prefix of the current label to UTF-16.
494 int32_t mappingStart=i-labelStart;
         destString=UnicodeString::fromUTF8(StringPiece(destArray+labelStart, mappingStart));
496 // Output the previous ASCII labels and process the rest of src in UTF-16.
         dest.Append(destArray, labelStart);
         // The source passed on starts at the current label, so the label
         // start is 0 and mappingStart is relative to that label.
         processUnicode(UnicodeString::fromUTF8(StringPiece(src, labelStart)), 0, mappingStart,
499 isLabel, toASCII,
500 destString, info, errorCode);
501 } else {
502 // src is too long for the ASCII fastpath implementation.
         processUnicode(UnicodeString::fromUTF8(src), 0, 0,
504 isLabel, toASCII,
505 destString, info, errorCode);
506 }
     destString.toUTF8(dest);  // calls dest.Flush()
508 if(toASCII && !isLabel) {
509 // length==labelStart==254 means that there is a trailing dot (ok) and
510 // destString is empty (do not index at 253-labelStart).
         // Total output length = bytes emitted by the fastpath + UTF-16 units
         // in destString (only checked further when destString is all ASCII).
         int32_t length=labelStart+destString.length();
         if( length>=254 && isASCIIString(destString) &&
513 (length>254 ||
              (labelStart<254 && destString[253-labelStart]!=0x2e))
515 ) {
516 info.errors|=UIDNA_ERROR_DOMAIN_NAME_TOO_LONG;
517 }
518 }
     // BiDi error if the processed part is not ok, or if the labels before
     // labelStart (handled only by the fastpath) fail the ASCII BiDi check,
     // which is run on the original source bytes here.
     if( info.isBiDi && U_SUCCESS(errorCode) && (info.errors&severeErrors)==0 &&
         (!info.isOkBiDi || (labelStart>0 && !isASCIIOkBiDi(srcArray, labelStart)))
521 ) {
522 info.errors|=UIDNA_ERROR_BIDI;
523 }
524 }

525

// Normalizes the not-yet-processed part of src into dest and then walks dest
// from labelStart, splitting it into labels at dots (unless isLabel) and
// calling processLabel() for each one.
//
// src           input string
// labelStart    index in dest where the current label starts
// mappingStart  index in src from which normalization is still needed;
//               0 means normalize all of src into dest, otherwise the
//               substring from mappingStart is normalized and appended to dest
// isLabel       TRUE to treat the input as one label (dots do not split)
// toASCII       selects which "nontransitional" option bit applies and is
//               passed through to processLabel()
// dest          in/out string being built
// info          receives error bits and isTransDiff
// errorCode     ICU error code
// Returns dest.
526 UnicodeString &
 UTS46::processUnicode(const UnicodeString &src,
528 int32_t labelStart, int32_t mappingStart,
529 UBool isLabel, UBool toASCII,
530 UnicodeString &dest,
                       IDNAInfo &info, UErrorCode &errorCode) const {
     if(mappingStart==0) {
         uts46Norm2.normalize(src, dest, errorCode);
534 } else {
         uts46Norm2.normalizeSecondAndAppend(dest, src.tempSubString(mappingStart), errorCode);
536 }
     if(U_FAILURE(errorCode)) {
538 return dest;
539 }
     // Deviation characters are mapped unless the matching
     // UIDNA_NONTRANSITIONAL_* option bit is set.
540 UBool doMapDevChars=
         toASCII ? (options&UIDNA_NONTRANSITIONAL_TO_ASCII)==0 :
                   (options&UIDNA_NONTRANSITIONAL_TO_UNICODE)==0;
     const UChar *destArray=dest.getBuffer();
     int32_t destLength=dest.length();
     // labelLimit scans forward from labelStart to find the end of the label.
545 int32_t labelLimit=labelStart;
546 while(labelLimit<destLength) {
547 UChar c=destArray[labelLimit];
         if(c==0x2e && !isLabel) {
             // Dot: the label [labelStart, labelLimit) is complete.
549 int32_t labelLength=labelLimit-labelStart;
             int32_t newLength=processLabel(dest, labelStart, labelLength,
551 toASCII, info, errorCode);
552 info.errors|=info.labelErrors;
553 info.labelErrors=0;
             if(U_FAILURE(errorCode)) {
555 return dest;
556 }
             // processLabel() may have changed dest: refresh the buffer
             // pointer, adjust the length, and continue after the dot.
557 destArray=dest.getBuffer();
558 destLength+=newLength-labelLength;
             labelLimit=labelStart+=newLength+1;
560 continue;
         } else if(c<0xdf) {
562 // pass
         } else if(c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
             // One of U+00DF, U+03C2, U+200C, U+200D (the characters that
             // mapDevChars() handles).
564 info.isTransDiff=TRUE;
565 if(doMapDevChars) {
                 destLength=mapDevChars(dest, labelStart, labelLimit, errorCode);
                 if(U_FAILURE(errorCode)) {
568 return dest;
569 }
570 destArray=dest.getBuffer();
571 // All deviation characters have been mapped, no need to check for them again.
572 doMapDevChars=FALSE;
573 // Do not increment labelLimit in case c was removed.
574 continue;
575 }
         } else if(U16_IS_SURROGATE(c)) {
             // Unpaired: a lead without a following trail, or a trail without
             // a preceding lead inside the current label.
             if(U16_IS_SURROGATE_LEAD(c) ?
                     (labelLimit+1)==destLength || !U16_IS_TRAIL(destArray[labelLimit+1]) :
                     labelLimit==labelStart || !U16_IS_LEAD(destArray[labelLimit-1])) {
580 // Map an unpaired surrogate to U+FFFD before normalization so that when
581 // that removes characters we do not turn two unpaired ones into a pair.
582 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
                 dest.setCharAt(labelLimit, 0xfffd);
584 destArray=dest.getBuffer();
585 }
586 }
587 ++labelLimit;
588 }
589 // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
590 // but not an empty label elsewhere nor a completely empty domain name.
591 // processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
     if(0==labelStart || labelStart<labelLimit) {
         processLabel(dest, labelStart, labelLimit-labelStart,
594 toASCII, info, errorCode);
595 info.errors|=info.labelErrors;
596 }
597 return dest;
598 }

599

// Maps the deviation characters in dest in place, from mappingStart to the
// end of the string: U+00DF becomes "ss", U+03C2 becomes U+03C3, and
// U+200C/U+200D are removed. If anything was mapped, the string from
// labelStart onward is normalized again with uts46Norm2.
//
// dest          string to modify
// labelStart    start of the current label; re-normalization begins here
// mappingStart  index of the first deviation character
// errorCode     ICU error code; set to U_MEMORY_ALLOCATION_ERROR if a buffer
//               cannot be obtained or dest becomes bogus
// Returns the new dest.length(), or 0 if errorCode was a failure on entry.
600 int32_t
 UTS46::mapDevChars(UnicodeString &dest, int32_t labelStart, int32_t mappingStart,
602 UErrorCode &errorCode) const {
     if(U_FAILURE(errorCode)) {
604 return 0;
605 }
     int32_t length=dest.length();
     // Ask for one extra unit up front when the first character to map is a
     // sharp s, which expands to two characters.
     UChar *s=dest.getBuffer(dest[mappingStart]==0xdf ? length+1 : length);
608 if(s==NULL) {
609 errorCode=U_MEMORY_ALLOCATION_ERROR;
610 return length;
611 }
     int32_t capacity=dest.getCapacity();
613 UBool didMapDevChars=FALSE;
     // readIndex runs ahead of writeIndex once characters have been removed.
     int32_t readIndex=mappingStart, writeIndex=mappingStart;
615 do {
616 UChar c=s[readIndex++];
617 switch(c) {
618 case 0xdf:
619 // Map sharp s to ss.
620 didMapDevChars=TRUE;
             s[writeIndex++]=0x73;  // Replace sharp s with first s.
622 // Insert second s and account for possible buffer reallocation.
             // writeIndex==readIndex means no earlier removal left a gap, so
             // the tail must be shifted right by one to make room.
623 if(writeIndex==readIndex) {
624 if(length==capacity) {
625 dest.releaseBuffer(length);
                     s=dest.getBuffer(length+1);
627 if(s==NULL) {
628 errorCode=U_MEMORY_ALLOCATION_ERROR;
629 return length;
630 }
631 capacity=dest.getCapacity();
632 }
                 u_memmove(s+writeIndex+1, s+writeIndex, length-writeIndex);
634 ++readIndex;
635 }
636 s[writeIndex++]=0x73;
637 ++length;
638 break;
639 case 0x3c2: // Map final sigma to nonfinal sigma.
640 didMapDevChars=TRUE;
641 s[writeIndex++]=0x3c3;
642 break;
643 case 0x200c: // Ignore/remove ZWNJ.
644 case 0x200d: // Ignore/remove ZWJ.
645 didMapDevChars=TRUE;
             // Nothing is written: the character is dropped and the string
             // becomes one unit shorter.
646 --length;
647 break;
648 default:
649 // Only really necessary if writeIndex was different from readIndex.
650 s[writeIndex++]=c;
651 break;
652 }
     } while(writeIndex<length);
654 dest.releaseBuffer(length);
655 if(didMapDevChars) {
656 // Mapping deviation characters might have resulted in an un-NFC string.
657 // We could use either the NFC or the UTS #46 normalizer.
658 // By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
659 UnicodeString normalized;
         uts46Norm2.normalize(dest.tempSubString(labelStart), normalized, errorCode);
         if(U_SUCCESS(errorCode)) {
             // Replace everything from labelStart to the end of dest.
             dest.replace(labelStart, 0x7fffffff, normalized);
             if(dest.isBogus()) {
664 errorCode=U_MEMORY_ALLOCATION_ERROR;
665 }
666 return dest.length();
667 }
668 }
669 return length;
670 }

671

672 // Some non-ASCII characters are equivalent to sequences with

673 // non-LDH ASCII characters. To find them:

674 // grep disallowed_STD3_valid IdnaMappingTable.txt (or uts46.txt)

675 static inline UBool

676 isNonASCIIDisallowedSTD3Valid(UChar32 c) {

     return c==0x2260 || c==0x226E || c==0x226F;

678 }

679

680 // Replace the label in dest with the label string, if the label was modified.

681 // If &label==&dest then the label was modified in-place and labelLength

682 // is the new label length, different from label.length().

683 // If &label!=&dest then labelLength==label.length().

684 // Returns labelLength (= the new label length).

685 static int32_t

 replaceLabel(UnicodeString &dest, int32_t destLabelStart, int32_t destLabelLength,

              const UnicodeString &label, int32_t labelLength, UErrorCode &errorCode) {

     if(U_FAILURE(errorCode)) {

689 return 0;

690 }

691 if(&label!=&dest) {

         dest.replace(destLabelStart, destLabelLength, label);

         if(dest.isBogus()) {

694 errorCode=U_MEMORY_ALLOCATION_ERROR;

695 return 0;

696 }

697 }

698 return labelLength;

699 }

700

// Validates one label of dest, located at [labelStart, labelStart+labelLength),
// and rewrites it in dest where needed.
//
// A label starting with "xn--" is first decoded from Punycode into a
// temporary string, which is then the string that gets validated. Validation
// sets bits in info.labelErrors (hyphen positions, dots, disallowed
// characters, leading combining mark, and the optional BiDi/CONTEXTJ/CONTEXTO
// checks). For toASCII, a label containing non-ASCII characters is converted
// to Punycode with an "xn--" prefix.
//
// dest         string containing the label; may be modified
// labelStart   start index of the label in dest
// labelLength  length of the label in dest
// toASCII      TRUE to produce the ASCII form and check the 63-unit limit
// info         receives label error bits
// errorCode    ICU error code
// Returns the new length of the label in dest, or 0 if errorCode was a
// failure on entry.
701 int32_t
 UTS46::processLabel(UnicodeString &dest,
703 int32_t labelStart, int32_t labelLength,
704 UBool toASCII,
                     IDNAInfo &info, UErrorCode &errorCode) const {
     if(U_FAILURE(errorCode)) {
707 return 0;
708 }
709 UnicodeString fromPunycode;
710 UnicodeString *labelString;
     const UChar *label=dest.getBuffer()+labelStart;
     // Remember where the label sits in dest: labelStart/labelLength are
     // redirected to fromPunycode below when the label is in Punycode.
712 int32_t destLabelStart=labelStart;
713 int32_t destLabelLength=labelLength;
714 UBool wasPunycode;
     if(labelLength>=4 && label[0]==0x78 && label[1]==0x6e && label[2]==0x2d && label[3]==0x2d) {
716 // Label starts with "xn--", try to un-Punycode it.
717 wasPunycode=TRUE;
         UChar *unicodeBuffer=fromPunycode.getBuffer(-1);  // capacity==-1: most labels should fit
719 if(unicodeBuffer==NULL) {
720 // Should never occur if we used capacity==-1 which uses the internal buffer.
721 errorCode=U_MEMORY_ALLOCATION_ERROR;
722 return labelLength;
723 }
         // A separate error code keeps Punycode failures from aborting the
         // overall operation; they are reported via info.labelErrors.
724 UErrorCode punycodeErrorCode=U_ZERO_ERROR;
         int32_t unicodeLength=u_strFromPunycode(label+4, labelLength-4,
726 unicodeBuffer, fromPunycode.getCapacity(),
727 NULL, &punycodeErrorCode);
728 if(punycodeErrorCode==U_BUFFER_OVERFLOW_ERROR) {
             // Retry once with a buffer of the length reported by the first call.
             fromPunycode.releaseBuffer(0);
             unicodeBuffer=fromPunycode.getBuffer(unicodeLength);
731 if(unicodeBuffer==NULL) {
732 errorCode=U_MEMORY_ALLOCATION_ERROR;
733 return labelLength;
734 }
735 punycodeErrorCode=U_ZERO_ERROR;
             unicodeLength=u_strFromPunycode(label+4, labelLength-4,
737 unicodeBuffer, fromPunycode.getCapacity(),
738 NULL, &punycodeErrorCode);
739 }
740 fromPunycode.releaseBuffer(unicodeLength);
         if(U_FAILURE(punycodeErrorCode)) {
742 info.labelErrors|=UIDNA_ERROR_PUNYCODE;
             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
744 }
745 // Check for NFC, and for characters that are not
746 // valid or deviation characters according to the normalizer.
747 // If there is something wrong, then the string will change.
748 // Note that the normalizer passes through non-LDH ASCII and deviation characters.
749 // Deviation characters are ok in Punycode even in transitional processing.
750 // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
751 // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
         UBool isValid=uts46Norm2.isNormalized(fromPunycode, errorCode);
         if(U_FAILURE(errorCode)) {
754 return labelLength;
755 }
756 if(!isValid) {
757 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
             return markBadACELabel(dest, labelStart, labelLength, toASCII, info, errorCode);
759 }
         // From here on, validate the decoded string instead of dest.
760 labelString=&fromPunycode;
761 label=fromPunycode.getBuffer();
762 labelStart=0;
763 labelLength=fromPunycode.length();
764 } else {
765 wasPunycode=FALSE;
766 labelString=&dest;
767 }
768 // Validity check
     if(labelLength==0) {
770 info.labelErrors|=UIDNA_ERROR_EMPTY_LABEL;
         return replaceLabel(dest, destLabelStart, destLabelLength,
772 *labelString, labelLength, errorCode);
773 }
774 // labelLength>0
     if(labelLength>=4 && label[2]==0x2d && label[3]==0x2d) {
776 // label starts with "??--"
777 info.labelErrors|=UIDNA_ERROR_HYPHEN_3_4;
778 }
     if(label[0]==0x2d) {
780 // label starts with "-"
781 info.labelErrors|=UIDNA_ERROR_LEADING_HYPHEN;
782 }
     if(label[labelLength-1]==0x2d) {
784 // label ends with "-"
785 info.labelErrors|=UIDNA_ERROR_TRAILING_HYPHEN;
786 }
787 // If the label was not a Punycode label, then it was the result of
788 // mapping, normalization and label segmentation.
789 // If the label was in Punycode, then we mapped it again above
790 // and checked its validity.
791 // Now we handle the STD3 restriction to LDH characters (if set)
792 // and we look for U+FFFD which indicates disallowed characters
793 // in a non-Punycode label or U+FFFD itself in a Punycode label.
794 // We also check for dots which can come from the input to a single-label function.
795 // Ok to cast away const because we own the UnicodeString.
796 UChar *s=(UChar *)label;
     const UChar *limit=label+labelLength;
     // oredChars accumulates the bitwise OR of all non-ASCII code units; it
     // is used below as a cheap pre-filter for the contextual checks and to
     // tell whether the label contains any non-ASCII character.
798 UChar oredChars=0;
799 // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;
801 do {
802 UChar c=*s;
         if(c<=0x7f) {
             if(c==0x2e) {
805 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;
806 *s=0xfffd;
             } else if(disallowNonLDHDot && asciiData[c]<0) {
808 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
809 *s=0xfffd;
810 }
811 } else {
812 oredChars|=c;
             if(disallowNonLDHDot && isNonASCIIDisallowedSTD3Valid(c)) {
814 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
815 *s=0xfffd;
             } else if(c==0xfffd) {
817 info.labelErrors|=UIDNA_ERROR_DISALLOWED;
818 }
819 }
820 ++s;
     } while(s<limit);
822 // Check for a leading combining mark after other validity checks
823 // so that we don't report UIDNA_ERROR_DISALLOWED for the U+FFFD from here.
824 UChar32 c;
825 int32_t cpLength=0;
826 // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
     U16_NEXT_UNSAFE(label, cpLength, c);
     if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
829 info.labelErrors|=UIDNA_ERROR_LEADING_COMBINING_MARK;
         // Replace the first code point (cpLength units) with one U+FFFD and
         // re-fetch the buffer pointer since the string was modified.
         labelString->replace(labelStart, cpLength, (UChar)0xfffd);
         label=labelString->getBuffer()+labelStart;
832 labelLength+=1-cpLength;
833 if(labelString==&dest) {
834 destLabelLength=labelLength;
835 }
836 }
     if((info.labelErrors&severeErrors)==0) {
838 // Do contextual checks only if we do not have U+FFFD from a severe error
839 // because U+FFFD can make these checks fail.
         if((options&UIDNA_CHECK_BIDI)!=0 && (!info.isBiDi || info.isOkBiDi)) {
             checkLabelBiDi(label, labelLength, info);
842 }
         // The oredChars test can only pass if some non-ASCII unit has the
         // bits of 0x200c set, so labels without such units skip the check.
         if( (options&UIDNA_CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
             !isLabelOkContextJ(label, labelLength)
845 ) {
846 info.labelErrors|=UIDNA_ERROR_CONTEXTJ;
847 }
         if((options&UIDNA_CHECK_CONTEXTO)!=0 && oredChars>=0xb7) {
             checkLabelContextO(label, labelLength, info);
850 }
851 if(toASCII) {
852 if(wasPunycode) {
853 // Leave a Punycode label unchanged if it has no severe errors.
                 if(destLabelLength>63) {
855 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
856 }
857 return destLabelLength;
             } else if(oredChars>=0x80) {
859 // Contains non-ASCII characters.
860 UnicodeString punycode;
                 UChar *buffer=punycode.getBuffer(63);  // 63==maximum DNS label length
862 if(buffer==NULL) {
863 errorCode=U_MEMORY_ALLOCATION_ERROR;
864 return destLabelLength;
865 }
                 buffer[0]=0x78;  // Write "xn--".
                 buffer[1]=0x6e;
                 buffer[2]=0x2d;
                 buffer[3]=0x2d;
                 int32_t punycodeLength=u_strToPunycode(label, labelLength,
                                                       buffer+4, punycode.getCapacity()-4,
872 NULL, &errorCode);
873 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
                     // Retry once with room for the reported length; the
                     // 4-unit "xn--" prefix is kept via releaseBuffer(4).
874 errorCode=U_ZERO_ERROR;
                     punycode.releaseBuffer(4);
                     buffer=punycode.getBuffer(4+punycodeLength);
877 if(buffer==NULL) {
878 errorCode=U_MEMORY_ALLOCATION_ERROR;
879 return destLabelLength;
880 }
                     punycodeLength=u_strToPunycode(label, labelLength,
                                                   buffer+4, punycode.getCapacity()-4,
883 NULL, &errorCode);
884 }
885 punycodeLength+=4;
886 punycode.releaseBuffer(punycodeLength);
                 if(U_FAILURE(errorCode)) {
888 return destLabelLength;
889 }
                 if(punycodeLength>63) {
891 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
892 }
                 return replaceLabel(dest, destLabelStart, destLabelLength,
894 punycode, punycodeLength, errorCode);
895 } else {
896 // all-ASCII label
                 if(labelLength>63) {
898 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;
899 }
900 }
901 }
902 } else {
903 // If a Punycode label has severe errors,
904 // then leave it but make sure it does not look valid.
905 if(wasPunycode) {
906 info.labelErrors|=UIDNA_ERROR_INVALID_ACE_LABEL;
             return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info, errorCode);
908 }
909 }
     // Remaining cases: write the validated label (dest itself, or the
     // decoded Punycode string) back into dest.
     return replaceLabel(dest, destLabelStart, destLabelLength,
911 *labelString, labelLength, errorCode);
912 }

913

914 // Make sure an ACE label does not look valid.

915 // Append U+FFFD if the label has only LDH characters.

916 // If UIDNA_USE_STD3_RULES, also replace disallowed ASCII characters with U+FFFD.

917 int32_t

 UTS46::markBadACELabel(UnicodeString &dest,

919 int32_t labelStart, int32_t labelLength,

                        UBool toASCII, IDNAInfo &info, UErrorCode &errorCode) const {

     if(U_FAILURE(errorCode)) {

922 return 0;

923 }

     UBool disallowNonLDHDot=(options&UIDNA_USE_STD3_RULES)!=0;

925 UBool isASCII=TRUE;

926 UBool onlyLDH=TRUE;

     const UChar *label=dest.getBuffer()+labelStart;

928 // Ok to cast away const because we own the UnicodeString.

     UChar *s=(UChar *)label+4;  // After the initial "xn--".

     const UChar *limit=label+labelLength;

931 do {

932 UChar c=*s;

         if(c<=0x7f) {

             if(c==0x2e) {

935 info.labelErrors|=UIDNA_ERROR_LABEL_HAS_DOT;

936 *s=0xfffd;

937 isASCII=onlyLDH=FALSE;

             } else if(asciiData[c]<0) {

939 onlyLDH=FALSE;

940 if(disallowNonLDHDot) {

941 *s=0xfffd;

942 isASCII=FALSE;

943 }

944 }

945 } else {

946 isASCII=onlyLDH=FALSE;

947 }

     } while(++s<limit);

949 if(onlyLDH) {

         dest.insert(labelStart+labelLength, (UChar)0xfffd);

         if(dest.isBogus()) {

952 errorCode=U_MEMORY_ALLOCATION_ERROR;

953 return 0;

954 }

955 ++labelLength;

956 } else {

         if(toASCII && isASCII && labelLength>63) {

958 info.labelErrors|=UIDNA_ERROR_LABEL_TOO_LONG;

959 }

960 }

961 return labelLength;

962 }

963

 const uint32_t L_MASK=U_MASK(U_LEFT_TO_RIGHT);

 const uint32_t R_AL_MASK=U_MASK(U_RIGHT_TO_LEFT)|U_MASK(U_RIGHT_TO_LEFT_ARABIC);

966 const uint32_t L_R_AL_MASK=L_MASK|R_AL_MASK;

967

 const uint32_t R_AL_AN_MASK=R_AL_MASK|U_MASK(U_ARABIC_NUMBER);

969

 const uint32_t EN_AN_MASK=U_MASK(U_EUROPEAN_NUMBER)|U_MASK(U_ARABIC_NUMBER);

971 const uint32_t R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;

 const uint32_t L_EN_MASK=L_MASK|U_MASK(U_EUROPEAN_NUMBER);

973

974 const uint32_t ES_CS_ET_ON_BN_NSM_MASK=

975 U_MASK(U_EUROPEAN_NUMBER_SEPARATOR)|

976 U_MASK(U_COMMON_NUMBER_SEPARATOR)|

977 U_MASK(U_EUROPEAN_NUMBER_TERMINATOR)|

978 U_MASK(U_OTHER_NEUTRAL)|

979 U_MASK(U_BOUNDARY_NEUTRAL)|

980 U_MASK(U_DIR_NON_SPACING_MARK);

981 const uint32_t L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;

 const uint32_t R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;

983

984 // We scan the whole label and check both for whether it contains RTL characters

985 // and whether it passes the BiDi Rule.

986 // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find

987 // that a domain name is a BiDi domain name (has an RTL label) only after

988 // processing several earlier labels.

989 void

 UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) const {

991 // IDNA2008 BiDi rule

992 // Get the directionality of the first character.

993 UChar32 c;

994 int32_t i=0;

     U16_NEXT_UNSAFE(label, i, c);

     uint32_t firstMask=U_MASK(u_charDirection(c));

997 // 1. The first character must be a character with BIDI property L, R

998 // or AL. If it has the R or AL property, it is an RTL label; if it

999 // has the L property, it is an LTR label.

     if((firstMask&~L_R_AL_MASK)!=0) {

1001 info.isOkBiDi=FALSE;

1002 }

1003 // Get the directionality of the last non-NSM character.

1004 uint32_t lastMask;

1005 for(;;) {

1006 if(i>=labelLength) {

1007 lastMask=firstMask;

1008 break;

1009 }

         U16_PREV_UNSAFE(label, labelLength, c);

1011 UCharDirection dir=u_charDirection(c);

1012 if(dir!=U_DIR_NON_SPACING_MARK) {

1013 lastMask=U_MASK(dir);

1014 break;

1015 }

1016 }

1017 // 3. In an RTL label, the end of the label must be a character with

1018 // BIDI property R, AL, EN or AN, followed by zero or more

1019 // characters with BIDI property NSM.

1020 // 6. In an LTR label, the end of the label must be a character with

1021 // BIDI property L or EN, followed by zero or more characters with

1022 // BIDI property NSM.

     if( (firstMask&L_MASK)!=0 ?

             (lastMask&~L_EN_MASK)!=0 :

1025 (lastMask&~R_AL_EN_AN_MASK)!=0

1026 ) {

1027 info.isOkBiDi=FALSE;

1028 }

1029 // Add the directionalities of the intervening characters.

1030 uint32_t mask=firstMask|lastMask;

1031 while(i<labelLength) {

         U16_NEXT_UNSAFE(label, i, c);

         mask|=U_MASK(u_charDirection(c));

1034 }

1035 if(firstMask&L_MASK) {

1036 // 5. In an LTR label, only characters with the BIDI properties L, EN,

1037 // ES, CS, ET, ON, BN and NSM are allowed.

         if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {

1039 info.isOkBiDi=FALSE;

1040 }

1041 } else {

1042 // 2. In an RTL label, only characters with the BIDI properties R, AL,

1043 // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.

         if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {

1045 info.isOkBiDi=FALSE;

1046 }

1047 // 4. In an RTL label, if an EN is present, no AN may be present, and

1048 // vice versa.

         if((mask&EN_AN_MASK)==EN_AN_MASK) {

1050 info.isOkBiDi=FALSE;

1051 }

1052 }

1053 // An RTL label is a label that contains at least one character of type

1054 // R, AL or AN. [...]

1055 // A "BIDI domain name" is a domain name that contains at least one RTL

1056 // label. [...]

1057 // The following rule, consisting of six conditions, applies to labels

1058 // in BIDI domain names.

     if((mask&R_AL_AN_MASK)!=0) {

1060 info.isBiDi=TRUE;

1061 }

1062 }

1063

1064 // Special code for the ASCII prefix of a BiDi domain name.

1065 // The ASCII prefix is all-LTR.

1066

1067 // IDNA2008 BiDi rule, parts relevant to ASCII labels:

1068 // 1. The first character must be a character with BIDI property L [...]

1069 // 5. In an LTR label, only characters with the BIDI properties L, EN,

1070 // ES, CS, ET, ON, BN and NSM are allowed.

1071 // 6. In an LTR label, the end of the label must be a character with

1072 // BIDI property L or EN [...]

1073

1074 // UTF-16 version, called for mapped ASCII prefix.

1075 // Cannot contain uppercase A-Z.

1076 // s[length-1] must be the trailing dot.

1077 static UBool

 isASCIIOkBiDi(const UChar *s, int32_t length) {

1079 int32_t labelStart=0;

     for(int32_t i=0; i<length; ++i) {

1081 UChar c=s[i];

         if(c==0x2e) {  // dot

1083 if(i>labelStart) {

                 c=s[i-1];

                 if(!(0x61<=c && c<=0x7a) && !(0x30<=c && c<=0x39)) {

1086 // Last character in the label is not an L or EN.

1087 return FALSE;

1088 }

1089 }

1090 labelStart=i+1;

         } else if(i==labelStart) {

             if(!(0x61<=c && c<=0x7a)) {

1093 // First character in the label is not an L.

1094 return FALSE;

1095 }

1096 } else {

             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {

1098 // Intermediate character in the label is a B, S or WS.

1099 return FALSE;

1100 }

1101 }

1102 }

1103 return TRUE;

1104 }

1105

1106 // UTF-8 version, called for source ASCII prefix.

1107 // Can contain uppercase A-Z.

1108 // s[length-1] must be the trailing dot.

1109 static UBool

 isASCIIOkBiDi(const char *s, int32_t length) {

1111 int32_t labelStart=0;

     for(int32_t i=0; i<length; ++i) {

1113 char c=s[i];

         if(c==0x2e) {  // dot

1115 if(i>labelStart) {

                 c=s[i-1];

                 if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a) && !(0x30<=c && c<=0x39)) {

1118 // Last character in the label is not an L or EN.

1119 return FALSE;

1120 }

1121 }

1122 labelStart=i+1;

         } else if(i==labelStart) {

             if(!(0x61<=c && c<=0x7a) && !(0x41<=c && c<=0x5a)) {

1125 // First character in the label is not an L.

1126 return FALSE;

1127 }

1128 } else {

             if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {

1130 // Intermediate character in the label is a B, S or WS.

1131 return FALSE;

1132 }

1133 }

1134 }

1135 return TRUE;

1136 }

1137

1138 UBool

 UTS46::isLabelOkContextJ(const UChar *label, int32_t labelLength) const {

1140 // [IDNA2008-Tables]

1141 // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER

     for(int32_t i=0; i<labelLength; ++i) {

         if(label[i]==0x200c) {

1144 // Appendix A.1. ZERO WIDTH NON-JOINER

1145 // Rule Set:

1146 // False;

1147 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;

1148 // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C

1149 // (Joining_Type:T)*(Joining_Type:{R,D})) Then True;

             if(i==0) {

1151 return FALSE;

1152 }

1153 UChar32 c;

1154 int32_t j=i;

             U16_PREV_UNSAFE(label, j, c);

             if(uts46Norm2.getCombiningClass(c)==9) {

1157 continue;

1158 }

1159 // check precontext (Joining_Type:{L,D})(Joining_Type:T)*

1160 for(;;) {

1161 UJoiningType type=ubidi_getJoiningType(c);

1162 if(type==U_JT_TRANSPARENT) {

                     if(j==0) {

1164 return FALSE;

1165 }

                     U16_PREV_UNSAFE(label, j, c);

                 } else if(type==U_JT_LEFT_JOINING || type==U_JT_DUAL_JOINING) {

1168 break; // precontext fulfilled

1169 } else {

1170 return FALSE;

1171 }

1172 }

1173 // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})

             for(j=i+1;;) {

1175 if(j==labelLength) {

1176 return FALSE;

1177 }

                 U16_NEXT_UNSAFE(label, j, c);

1179 UJoiningType type=ubidi_getJoiningType(c);

1180 if(type==U_JT_TRANSPARENT) {

1181 // just skip this character

                 } else if(type==U_JT_RIGHT_JOINING || type==U_JT_DUAL_JOINING) {

1183 break; // postcontext fulfilled

1184 } else {

1185 return FALSE;

1186 }

1187 }

         } else if(label[i]==0x200d) {

1189 // Appendix A.2. ZERO WIDTH JOINER (U+200D)

1190 // Rule Set:

1191 // False;

1192 // If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;

             if(i==0) {

1194 return FALSE;

1195 }

1196 UChar32 c;

1197 int32_t j=i;

             U16_PREV_UNSAFE(label, j, c);

             if(uts46Norm2.getCombiningClass(c)!=9) {

1200 return FALSE;

1201 }

1202 }

1203 }

1204 return TRUE;

1205 }

1206

1207 void

 UTS46::checkLabelContextO(const UChar *label, int32_t labelLength, IDNAInfo &info) const {

     int32_t labelEnd=labelLength-1;  // inclusive

     int32_t arabicDigits=0;  // -1 for 066x, +1 for 06Fx

     for(int32_t i=0; i<=labelEnd; ++i) {

1212 UChar32 c=label[i];

         if(c<0xb7) {

1214 // ASCII fastpath

         } else if(c<=0x6f9) {

             if(c==0xb7) {

1217 // Appendix A.3. MIDDLE DOT (U+00B7)

1218 // Rule Set:

1219 // False;

1220 // If Before(cp) .eq. U+006C And

1221 // After(cp) .eq. U+006C Then True;

                 if(!(0<i && label[i-1]==0x6c &&

                      i<labelEnd && label[i+1]==0x6c)) {

1224 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;

1225 }

             } else if(c==0x375) {

1227 // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)

1228 // Rule Set:

1229 // False;

1230 // If Script(After(cp)) .eq. Greek Then True;

1231 UScriptCode script=USCRIPT_INVALID_CODE;

1232 if(i<labelEnd) {

1233 UErrorCode errorCode=U_ZERO_ERROR;

                     int32_t j=i+1;

                     U16_NEXT(label, j, labelLength, c);

                     script=uscript_getScript(c, &errorCode);

1237 }

1238 if(script!=USCRIPT_GREEK) {

1239 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;

1240 }

             } else if(c==0x5f3 || c==0x5f4) {

1242 // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)

1243 // Rule Set:

1244 // False;

1245 // If Script(Before(cp)) .eq. Hebrew Then True;

1246 //

1247 // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)

1248 // Rule Set:

1249 // False;

1250 // If Script(Before(cp)) .eq. Hebrew Then True;

1251 UScriptCode script=USCRIPT_INVALID_CODE;

                 if(0<i) {

1253 UErrorCode errorCode=U_ZERO_ERROR;

1254 int32_t j=i;

                     U16_PREV(label, 0, j, c);

                     script=uscript_getScript(c, &errorCode);

1257 }

1258 if(script!=USCRIPT_HEBREW) {

1259 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;

1260 }

             } else if(0x660<=c /* && c<=0x6f9 */) {

1262 // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)

1263 // Rule Set:

1264 // True;

1265 // For All Characters:

1266 // If cp .in. 06F0..06F9 Then False;

1267 // End For;

1268 //

1269 // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)

1270 // Rule Set:

1271 // True;

1272 // For All Characters:

1273 // If cp .in. 0660..0669 Then False;

1274 // End For;

                 if(c<=0x669) {

                     if(arabicDigits>0) {

1277 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;

1278 }

1279 arabicDigits=-1;

                 } else if(0x6f0<=c) {

                     if(arabicDigits<0) {

1282 info.labelErrors|=UIDNA_ERROR_CONTEXTO_DIGITS;

1283 }

1284 arabicDigits=1;

1285 }

1286 }

         } else if(c==0x30fb) {

1288 // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)

1289 // Rule Set:

1290 // False;

1291 // For All Characters:

1292 // If Script(cp) .in. {Hiragana, Katakana, Han} Then True;

1293 // End For;

1294 UErrorCode errorCode=U_ZERO_ERROR;

             for(int j=0;;) {

1296 if(j>labelEnd) {

1297 info.labelErrors|=UIDNA_ERROR_CONTEXTO_PUNCTUATION;

1298 break;

1299 }

                 U16_NEXT(label, j, labelLength, c);

                 UScriptCode script=uscript_getScript(c, &errorCode);

                 if(script==USCRIPT_HIRAGANA || script==USCRIPT_KATAKANA || script==USCRIPT_HAN) {

1303 break;

1304 }

1305 }

1306 }

1307 }

1308 }

1309

1310 U_NAMESPACE_END

1311

1312 // C API ------------------------------------------------------------------- ***

1313

1314 U_NAMESPACE_USE

1315

1316 U_CAPI UIDNA * U_EXPORT2

 uidna_openUTS46(uint32_t options, UErrorCode *pErrorCode) {

     return reinterpret_cast<UIDNA *>(IDNA::createUTS46Instance(options, *pErrorCode));

1319 }

1320

1321 U_CAPI void U_EXPORT2

1322 uidna_close(UIDNA *idna) {

1323 delete reinterpret_cast<IDNA *>(idna);

1324 }

1325

1326 static UBool

 checkArgs(const void *label, int32_t length,

           void *dest, int32_t capacity,

1329 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(U_FAILURE(*pErrorCode)) {

1331 return FALSE;

1332 }

1333 // sizeof(UIDNAInfo)=16 in the first API version.

     if(pInfo==NULL || pInfo->size<16) {

1335 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

1336 return FALSE;

1337 }

     if( (label==NULL ? length!=0 : length<-1) ||

         (dest==NULL ? capacity!=0 : capacity<0) ||

         (dest==label && label!=NULL)

1341 ) {

1342 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

1343 return FALSE;

1344 }

1345 // Set all *pInfo bytes to 0 except for the size field itself.

     uprv_memset(&pInfo->size+1, 0, pInfo->size-sizeof(pInfo->size));

1347 return TRUE;

1348 }

1349

1350 static void

 idnaInfoToStruct(IDNAInfo &info, UIDNAInfo *pInfo) {

     pInfo->isTransitionalDifferent=info.isTransitionalDifferent();

     pInfo->errors=info.getErrors();

1354 }

1355

1356 U_CAPI int32_t U_EXPORT2

 uidna_labelToASCII(const UIDNA *idna,

                    const UChar *label, int32_t length,

1359 UChar *dest, int32_t capacity,

1360 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

1362 return 0;

1363 }

     UnicodeString src((UBool)(length<0), label, length);

     UnicodeString destString(dest, 0, capacity);

1366 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->labelToASCII(src, destString, info, *pErrorCode);

1368 idnaInfoToStruct(info, pInfo);

     return destString.extract(dest, capacity, *pErrorCode);

1370 }

1371

1372 U_CAPI int32_t U_EXPORT2

 uidna_labelToUnicode(const UIDNA *idna,

                      const UChar *label, int32_t length,

1375 UChar *dest, int32_t capacity,

1376 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

1378 return 0;

1379 }

     UnicodeString src((UBool)(length<0), label, length);

     UnicodeString destString(dest, 0, capacity);

1382 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->labelToUnicode(src, destString, info, *pErrorCode);

1384 idnaInfoToStruct(info, pInfo);

     return destString.extract(dest, capacity, *pErrorCode);

1386 }

1387

1388 U_CAPI int32_t U_EXPORT2

 uidna_nameToASCII(const UIDNA *idna,

                   const UChar *name, int32_t length,

1391 UChar *dest, int32_t capacity,

1392 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

1394 return 0;

1395 }

     UnicodeString src((UBool)(length<0), name, length);

     UnicodeString destString(dest, 0, capacity);

1398 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->nameToASCII(src, destString, info, *pErrorCode);

1400 idnaInfoToStruct(info, pInfo);

     return destString.extract(dest, capacity, *pErrorCode);

1402 }

1403

1404 U_CAPI int32_t U_EXPORT2

 uidna_nameToUnicode(const UIDNA *idna,

                     const UChar *name, int32_t length,

1407 UChar *dest, int32_t capacity,

1408 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

1410 return 0;

1411 }

     UnicodeString src((UBool)(length<0), name, length);

     UnicodeString destString(dest, 0, capacity);

1414 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->nameToUnicode(src, destString, info, *pErrorCode);

1416 idnaInfoToStruct(info, pInfo);

     return destString.extract(dest, capacity, *pErrorCode);

1418 }

1419

1420 U_CAPI int32_t U_EXPORT2

 uidna_labelToASCII_UTF8(const UIDNA *idna,

                         const char *label, int32_t length,

                         char *dest, int32_t capacity,

1424 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

1426 return 0;

1427 }

     StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);

1429 CheckedArrayByteSink sink(dest, capacity);

1430 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->labelToASCII_UTF8(src, sink, info, *pErrorCode);

1432 idnaInfoToStruct(info, pInfo);

     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);

1434 }

1435

1436 U_CAPI int32_t U_EXPORT2

 uidna_labelToUnicodeUTF8(const UIDNA *idna,

                          const char *label, int32_t length,

                          char *dest, int32_t capacity,

1440 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(label, length, dest, capacity, pInfo, pErrorCode)) {

1442 return 0;

1443 }

     StringPiece src(label, length<0 ? static_cast<int32_t>(uprv_strlen(label)) : length);

1445 CheckedArrayByteSink sink(dest, capacity);

1446 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->labelToUnicodeUTF8(src, sink, info, *pErrorCode);

1448 idnaInfoToStruct(info, pInfo);

     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);

1450 }

1451

1452 U_CAPI int32_t U_EXPORT2

 uidna_nameToASCII_UTF8(const UIDNA *idna,

                        const char *name, int32_t length,

                        char *dest, int32_t capacity,

1456 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

1458 return 0;

1459 }

     StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);

1461 CheckedArrayByteSink sink(dest, capacity);

1462 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->nameToASCII_UTF8(src, sink, info, *pErrorCode);

1464 idnaInfoToStruct(info, pInfo);

     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);

1466 }

1467

1468 U_CAPI int32_t U_EXPORT2

 uidna_nameToUnicodeUTF8(const UIDNA *idna,

                         const char *name, int32_t length,

                         char *dest, int32_t capacity,

1472 UIDNAInfo *pInfo, UErrorCode *pErrorCode) {

     if(!checkArgs(name, length, dest, capacity, pInfo, pErrorCode)) {

1474 return 0;

1475 }

     StringPiece src(name, length<0 ? static_cast<int32_t>(uprv_strlen(name)) : length);

1477 CheckedArrayByteSink sink(dest, capacity);

1478 IDNAInfo info;

     reinterpret_cast<const IDNA *>(idna)->nameToUnicodeUTF8(src, sink, info, *pErrorCode);

1480 idnaInfoToStruct(info, pInfo);

     return u_terminateChars(dest, capacity, sink.NumberOfBytesAppended(), pErrorCode);

1482 }

1483

1484 #endif // UCONFIG_NO_IDNA