ICU-64243.0.1.tar.gz

[apple/icu.git] / icuSources / common / normalizer2impl.h
diff --git a/icuSources/common/normalizer2impl.h b/icuSources/common/normalizer2impl.h

index 9dd4d1e5ab188b699e72ec0ba54ca1929182e09e..cf3015ea881bfc7e4c8e7147a4ea55b55859a225 100644 (file)
--- a/icuSources/common/normalizer2impl.h
+++ b/icuSources/common/normalizer2impl.h
@@ -24,12 +24,20 @@
  #if !UCONFIG_NO_NORMALIZATION
  
  #include "unicode/normalizer2.h"
+#include "unicode/ucptrie.h"
  #include "unicode/unistr.h"
  #include "unicode/unorm.h"
+#include "unicode/utf.h"
  #include "unicode/utf16.h"
  #include "mutex.h"
+#include "udataswp.h"
  #include "uset_imp.h"
-#include "utrie2.h"
+
+// When the nfc.nrm data is *not* hardcoded into the common library
+// (with this constant set to 0),
+// then it needs to be built into the data package:
+// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT
+#define NORM2_HARDCODE_NFC_DATA 1
  
  U_NAMESPACE_BEGIN
  
@@ -118,7 +126,7 @@ public:
              buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
              buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
          } else {
-            buffer[0]=orig-c2;  // LV syllable
+            buffer[0]=(UChar)(orig-c2);  // LV syllable
              buffer[1]=(UChar)(JAMO_T_BASE+c2);
          }
      }
@@ -158,8 +166,7 @@ public:
              appendBMP((UChar)c, cc, errorCode) :
              appendSupplementary(c, cc, errorCode);
      }
-    // s must be in NFD, otherwise change the implementation.
-    UBool append(const UChar *s, int32_t length,
+    UBool append(const UChar *s, int32_t length, UBool isNFD,
                   uint8_t leadCC, uint8_t trailCC,
                   UErrorCode &errorCode);
      UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
@@ -238,12 +245,10 @@ private:
   */
  class U_COMMON_API Normalizer2Impl : public UObject {
  public:
-    Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) {
-        fCanonIterDataInitOnce.reset();
-    }
+    Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) { }  // fCanonIterDataInitOnce is set by its in-class initializer
      virtual ~Normalizer2Impl();
  
-    void init(const int32_t *inIndexes, const UTrie2 *inTrie,
+    void init(const int32_t *inIndexes, const UCPTrie *inTrie,
                const uint16_t *inExtraData, const uint8_t *inSmallFCD);
  
      void addLcccChars(UnicodeSet &set) const;
@@ -254,7 +259,14 @@ public:
  
      UBool ensureCanonIterData(UErrorCode &errorCode) const;
  
-    uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
+    // The trie stores values for lead surrogate code *units* (used when processing UTF-16 strings).
+    // Surrogate code *points* are inert, so a lead surrogate passed in as a code point yields INERT.
+    uint16_t getNorm16(UChar32 c) const {
+        return U_IS_LEAD(c) ?
+            static_cast<uint16_t>(INERT) :
+            UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c);
+    }
+    uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); }  // trie value as stored; no lead-surrogate check
  
      UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
          if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
@@ -704,12 +716,12 @@ private:
      uint16_t centerNoNoDelta;
      uint16_t minMaybeYes;
  
-    const UTrie2 *normTrie;
+    const UCPTrie *normTrie;
      const uint16_t *maybeYesCompositions;
      const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
      const uint8_t *smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
  
-    UInitOnce       fCanonIterDataInitOnce;
+    UInitOnce       fCanonIterDataInitOnce = U_INITONCE_INITIALIZER;
      CanonIterData  *fCanonIterData;
  };
  
@@ -764,7 +776,7 @@ unorm_getFCD16(UChar32 c);
  
  /**
   * Format of Normalizer2 .nrm data files.
- * Format version 3.0.
+ * Format version 4.0.
   *
   * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
   * ICU ships with data files for standard Unicode Normalization Forms
@@ -818,7 +830,7 @@ unorm_getFCD16(UChar32 c);
   *          minMaybeYes=indexes[IX_MIN_MAYBE_YES];
   *      See the normTrie description below and the design doc for details.
   *
- * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
+ * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie
   *
   *      The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
   *      Rather than using independent bits in the value (which would require more than 16 bits),
@@ -946,6 +958,20 @@ unorm_getFCD16(UChar32 c);
   *   which is artificially assigned "worst case" values lccc=1 and tccc=255.
   *
   * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
+ *
+ * Changes from format version 3 to format version 4 (ICU 63) ------------------
+ *
+ * Switched from UTrie2 to UCPTrie/CodePointTrie.
+ *
+ * The new trie no longer stores different values for surrogate code *units* vs.
+ * surrogate code *points*.
+ * Lead surrogates still have values for optimized UTF-16 string processing.
+ * When looking up code point properties, the code now checks for lead surrogates and
+ * treats them as inert.
+ *
+ * gennorm2 now has to reject mappings for surrogate code points.
+ * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its
+ * custom normalization data file.
   */
  
  #endif  /* !UCONFIG_NO_NORMALIZATION */