ICU-64260.0.1.tar.gz

[apple/icu.git] / icuSources / i18n / unicode / ucsdet.h
diff --git a/icuSources/i18n/unicode/ucsdet.h b/icuSources/i18n/unicode/ucsdet.h

index 27e2e34e6aa710e46d57281e62fb77cf922ac392..892f3ee41265db4c6e970e5e9139dd1be6202ba5 100644 (file)
--- a/icuSources/i18n/unicode/ucsdet.h
+++ b/icuSources/i18n/unicode/ucsdet.h
@@ -1,10 +1,12 @@
+// © 2016 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
  /*
   **********************************************************************
- *   Copyright (C) 2005-2006, International Business Machines
+ *   Copyright (C) 2005-2013, International Business Machines
   *   Corporation and others.  All Rights Reserved.
   **********************************************************************
   *   file name:  ucsdet.h
- *   encoding:   US-ASCII
+ *   encoding:   UTF-8
   *   indentation:4
   *
   *   created on: 2005Aug04
@@ -22,6 +24,8 @@
  #include "unicode/utypes.h"
  
  #if !UCONFIG_NO_CONVERSION
+
+#include "unicode/localpointer.h"
  #include "unicode/uenum.h"
  
  /**
@@ -41,13 +45,17 @@
   * in a single language, and a minimum of a few hundred bytes worth of plain text
   * in the language are needed.  The detection process will attempt to
   * ignore html or xml style markup that could otherwise obscure the content.
+ * <p>
+ * An alternative to the ICU Charset Detector is the
+ * Compact Encoding Detector, https://github.com/google/compact_enc_det.
+ * It often gives more accurate results, especially with short input samples.
   */
   
  
  struct UCharsetDetector;
  /**
    * Structure representing a charset detector
-  * @draft ICU 3.6
+  * @stable ICU 3.6
    */
  typedef struct UCharsetDetector UCharsetDetector;
  
@@ -55,7 +63,7 @@ struct UCharsetMatch;
  /**
    *  Opaque structure representing a match that was identified
    *  from a charset detection operation.
-  *  @draft ICU 3.6
+  *  @stable ICU 3.6
    */
  typedef struct UCharsetMatch UCharsetMatch;
  
@@ -65,9 +73,9 @@ typedef struct UCharsetMatch UCharsetMatch;
    *  @param status Any error conditions occurring during the open
    *                operation are reported back in this variable.
    *  @return the newly opened charset detector.
-  *  @draft ICU 3.6
+  *  @stable ICU 3.6
    */
-U_DRAFT UCharsetDetector * U_EXPORT2
+U_STABLE UCharsetDetector * U_EXPORT2
  ucsdet_open(UErrorCode   *status);
  
  /**
@@ -77,11 +85,30 @@ ucsdet_open(UErrorCode   *status);
    *   memory leaks in the application.
    *
    *  @param ucsd  The charset detector to be closed.
-  *  @draft ICU 3.6
+  *  @stable ICU 3.6
    */
-U_DRAFT void U_EXPORT2
+U_STABLE void U_EXPORT2
  ucsdet_close(UCharsetDetector *ucsd);
  
+#if U_SHOW_CPLUSPLUS_API
+
+U_NAMESPACE_BEGIN
+
+/**
+ * \class LocalUCharsetDetectorPointer
+ * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
+ * For most methods see the LocalPointerBase base class.
+ *
+ * @see LocalPointerBase
+ * @see LocalPointer
+ * @stable ICU 4.4
+ */
+U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
+
+U_NAMESPACE_END
+
+#endif // U_SHOW_CPLUSPLUS_API
+
  /**
    * Set the input byte data whose charset is to be detected.
    *
@@ -95,9 +122,9 @@ ucsdet_close(UCharsetDetector *ucsd);
    *               is NUL terminated.
    * @param status any error conditions are reported back in this variable.
    *
-  * @draft ICU 3.6
+  * @stable ICU 3.6
    */
-U_DRAFT void U_EXPORT2
+U_STABLE void U_EXPORT2
  ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
  
  
@@ -117,9 +144,9 @@ ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCo
   *                  is NUL terminated.
   * @param status    any error conditions are reported back in this variable.
   *
- * @draft ICU 3.6
+ * @stable ICU 3.6
   */
-U_DRAFT void U_EXPORT2
+U_STABLE void U_EXPORT2
  ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
  
  
@@ -146,9 +173,9 @@ ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t
   * @return          a UCharsetMatch  representing the best matching charset,
   *                  or NULL if no charset matches the byte data.
   *
- * @draft ICU 3.6
+ * @stable ICU 3.6
   */
-U_DRAFT const UCharsetMatch * U_EXPORT2
+U_STABLE const UCharsetMatch * U_EXPORT2
  ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
      
  
@@ -180,9 +207,9 @@ ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
   *                      This array, and the UCharsetMatch instances to which it refers,
   *                      are owned by the UCharsetDetector, and will remain valid until
   *                      the detector is closed or modified.
- * @draft ICU 3.4
+ * @stable ICU 3.6
   */
-U_DRAFT const UCharsetMatch ** U_EXPORT2
+U_STABLE const UCharsetMatch ** U_EXPORT2
  ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
  
  
@@ -200,9 +227,9 @@ ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *stat
   *  @param status  Any error conditions are reported back in this variable.
   *  @return        The name of the matching charset.
   *
- *  @draft ICU 3.6
+ *  @stable ICU 3.6
   */
-U_DRAFT const char * U_EXPORT2
+U_STABLE const char * U_EXPORT2
  ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
  
  /**
@@ -226,9 +253,9 @@ ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
   *  @param status  Any error conditions are reported back in this variable.
   *  @return        A confidence number for the charset match.
   *
- *  @draft ICU 3.6
+ *  @stable ICU 3.6
   */
-U_DRAFT int32_t U_EXPORT2
+U_STABLE int32_t U_EXPORT2
  ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
  
  /**
@@ -258,9 +285,9 @@ ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
   *  @return        The RFC 3066 code for the language of the input data, or
   *                 an empty string if the language could not be determined.
   *
- *  @draft ICU 3.6
+ *  @stable ICU 3.6
   */
-U_DRAFT const char * U_EXPORT2
+U_STABLE const char * U_EXPORT2
  ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
  
  
@@ -284,9 +311,9 @@ ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
    * @param status  Any error conditions are reported back in this variable.
    * @return        The number of UChars in the output string.
    *
-  * @draft ICU 3.6
+  * @stable ICU 3.6
    */
-U_DRAFT  int32_t U_EXPORT2
+U_STABLE  int32_t U_EXPORT2
  ucsdet_getUChars(const UCharsetMatch *ucsm,
                   UChar *buf, int32_t cap, UErrorCode *status);
  
@@ -300,33 +327,42 @@ ucsdet_getUChars(const UCharsetMatch *ucsm,
    *  The returned UEnumeration provides access to the names of
    *  the charsets.
    *
+  *  <p>
    *  The state of the Charset detector that is passed in does not
    *  affect the result of this function, but requiring a valid, open
    *  charset detector as a parameter ensures that the charset detection
    *  service has been safely initialized and that the required detection
    *  data is available.
    *
+  *  <p>
+  *  <b>Note:</b> Multiple different charset encodings in the same family may use
+  *  a single shared name in this implementation. For example, this method returns
+  *  an enumeration that includes "ISO-8859-1" (ISO Latin 1), but not "windows-1252"
+  *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
+  *  when the input data matches Latin 1 code points and also contains code points
+  *  that are only available in "windows-1252".
+  *
    *  @param ucsd a Charset detector.
    *  @param status  Any error conditions are reported back in this variable.
    *  @return an iterator providing access to the detectable charset names.
-  *  @draft ICU 3.6
+  *  @stable ICU 3.6
    */
-
-U_DRAFT  UEnumeration * U_EXPORT2
+U_STABLE  UEnumeration * U_EXPORT2
  ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
  
-
  /**
    *  Test whether input filtering is enabled for this charset detector.
    *  Input filtering removes text that appears to be HTML or xml
    *  markup from the input before applying the code page detection
-  *  heuristics.
+  *  heuristics. Apple addition per <rdar://problem/48093252>: Will also
+  *  remove text that appears to be CSS declaration blocks.
    *
    *  @param ucsd  The charset detector to check.
    *  @return TRUE if filtering is enabled.
-  *  @draft ICU 3.4
+  *  @stable ICU 3.6
    */
-U_DRAFT  UBool U_EXPORT2
+
+U_STABLE  UBool U_EXPORT2
  ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
  
  
@@ -334,16 +370,52 @@ ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
   * Enable filtering of input text. If filtering is enabled,
   * text within angle brackets ("<" and ">") will be removed
   * before detection, which will remove most HTML or xml markup.
+ * Apple addition per <rdar://problem/48093252>: Will also
+ * remove text between '{' and '}', e.g. CSS declaration blocks.
   *
   * @param ucsd   the charset detector to be modified.
   * @param filter <code>true</code> to enable input text filtering.
   * @return The previous setting.
   *
- * @draft ICU 3.6
+ * @stable ICU 3.6
   */
-U_DRAFT  UBool U_EXPORT2
+U_STABLE  UBool U_EXPORT2
  ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
  
+#ifndef U_HIDE_INTERNAL_API
+/**
+  *  Get an iterator over the set of detectable charsets -
+  *  over the charsets that are enabled by the specified charset detector.
+  *
+  *  The returned UEnumeration provides access to the names of
+  *  the charsets.
+  *
+  *  @param ucsd a Charset detector.
+  *  @param status  Any error conditions are reported back in this variable.
+  *  @return an iterator providing access to the detectable charset names by
+  *  the specified charset detector.
+  *  @internal
+  */
+U_INTERNAL UEnumeration * U_EXPORT2
+ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
+
+/**
+  * Enable or disable an individual charset encoding.
+  * The name of the charset encoding must be included in the names returned by
+  * {@link #ucsdet_getAllDetectableCharsets()}.
+  *
+  * @param ucsd a Charset detector.
+  * @param encoding the name of the charset encoding.
+  * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
+  *   charset encoding.
+  * @param status receives the return status. When the name of charset encoding
+  *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
+  * @internal
+  */
+U_INTERNAL void U_EXPORT2
+ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
+#endif  /* U_HIDE_INTERNAL_API */
+
  #endif
  #endif   /* __UCSDET_H */