icuSources/i18n/unicode/ucsdet.h

   1 /*
   2  **********************************************************************
   3  *   Copyright (C) 2005-2013, International Business Machines
   4  *   Corporation and others.  All Rights Reserved.
   5  **********************************************************************
   6  *   file name:  ucsdet.h
   7  *   encoding:   US-ASCII
   8  *   indentation:4
   9  *
  10  *   created on: 2005Aug04
  11  *   created by: Andy Heninger
  12  *
  13  *   ICU Character Set Detection, API for C
  14  *
  15  *   Draft version 18 Oct 2005
  16  *
  17  */
  18
  19 #ifndef __UCSDET_H
  20 #define __UCSDET_H
  21
  22 #include "unicode/utypes.h"
  23
  24 #if !UCONFIG_NO_CONVERSION
  25
  26 #include "unicode/localpointer.h"
  27 #include "unicode/uenum.h"
  28
  29 /**
  30  * \file
  31  * \brief C API: Charset Detection API
  32  *
  33  * This API provides a facility for detecting the
  34  * charset or encoding of character data in an unknown text format.
  35  * The input data is supplied as an array of bytes.
  36  * <p>
  37  * Character set detection is at best an imprecise operation.  The detection
  38  * process will attempt to identify the charset that best matches the characteristics
  39  * of the byte data, but the process is partly statistical in nature, and
  40  * the results cannot be guaranteed to always be correct.
  41  * <p>
  42  * For best accuracy in charset detection, the input data should be primarily
  43  * in a single language, and a minimum of a few hundred bytes worth of plain text
  44  * in the language are needed.  The detection process will attempt to
  45  * ignore HTML or XML style markup that could otherwise obscure the content.
  46  */
  47
  48
  49 struct UCharsetDetector;
  50 /**
  51   * Opaque structure representing a charset detector, obtained from ucsdet_open().
  52   * @stable ICU 3.6
  53   */
  54 typedef struct UCharsetDetector UCharsetDetector;
  55
  56 struct UCharsetMatch;
  57 /**
  58   *  Opaque structure representing a match that was identified
  59   *  from a charset detection operation (ucsdet_detect() or ucsdet_detectAll()).
  60   *  @stable ICU 3.6
  61   */
  62 typedef struct UCharsetMatch UCharsetMatch;
  63
  64 /**
  65   *  Open a charset detector.  Release it with ucsdet_close() when finished.
  66   *
  67   *  @param status Any error conditions occurring during the open
  68   *                operation are reported back in this variable.
  69   *  @return the newly opened charset detector.
  70   *  @stable ICU 3.6
  71   */
  72 U_STABLE UCharsetDetector * U_EXPORT2
  73 ucsdet_open(UErrorCode   *status);
  74
  75 /**
  76   * Close a charset detector.  All storage and any other resources owned by
  77   * this charset detector, including the UCharsetMatch objects it returned,
  78   * will be released.  Failure to close a charset detector when finished
  79   * with it can result in memory leaks in the application.
  80   *
  81   *  @param ucsd  The charset detector to be closed.
  82   *  @stable ICU 3.6
  83   */
  84 U_STABLE void U_EXPORT2
  85 ucsdet_close(UCharsetDetector *ucsd);
  86
  87 #if U_SHOW_CPLUSPLUS_API
  88
  89 U_NAMESPACE_BEGIN
  90
  91 /**
  92  * \class LocalUCharsetDetectorPointer
  93  * "Smart pointer" class that closes a UCharsetDetector via ucsdet_close().
  94  * For most methods see the LocalPointerBase base class.
  95  *
  96  * @see LocalPointerBase
  97  * @see LocalPointer
  98  * @stable ICU 4.4
  99  */
 100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
 101
 102 U_NAMESPACE_END
 103
 104 #endif  /* U_SHOW_CPLUSPLUS_API */
 105
 106 /**
 107   * Set the input byte data whose charset is to be detected.
 108   *
 109   * Ownership of the input text byte array remains with the caller.
 110   * The input string must not be altered or deleted until the charset
 111   * detector is either closed or reset to refer to different input text.
 112   *
 113   * @param ucsd   the charset detector to be used.
 114   * @param textIn the input text of unknown encoding.
 115   * @param len    the length of the input text, or -1 if the text
 116   *               is NUL terminated.
 117   * @param status any error conditions are reported back in this variable.
 118   *
 119   * @stable ICU 3.6
 120   */
 121 U_STABLE void U_EXPORT2
 122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
 123
 124
 125 /** Set the declared encoding for charset detection.
 126  *  The declared encoding of an input text is an encoding obtained
 127  *  by the user from an HTTP header or XML declaration or similar source that
 128  *  can be provided as an additional hint to the charset detector.
 129  *
 130  *  How and whether the declared encoding will be used during the
 131  *  detection process is TBD.
 132  *
 133  * @param ucsd      the charset detector to be used.
 134  * @param encoding  the name of an encoding for the current data, obtained
 135  *                  from a header or declaration or other source outside
 136  *                  of the byte data itself.
 137  * @param length    the length of the encoding name, or -1 if the name string
 138  *                  is NUL terminated.
 139  * @param status    any error conditions are reported back in this variable.
 140  *
 141  * @stable ICU 3.6
 142  */
 143 U_STABLE void U_EXPORT2
 144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
 145
 146
 147 /**
 148  * Return the charset that best matches the supplied input data.
 149  *
 150  * Note, though, that because the detection
 151  * only looks at the start of the input data,
 152  * there is a possibility that the returned charset will fail to handle
 153  * the full set of input data.
 154  * <p>
 155  * The returned UCharsetMatch object is owned by the UCharsetDetector.
 156  * It will remain valid until the detector input is reset, or until
 157  * the detector is closed.
 158  * <p>
 159  * The function will fail if
 160  *  <ul>
 161  *    <li>no charset appears to match the data.</li>
 162  *    <li>no input text has been provided.</li>
 163  *  </ul>
 164  *
 165  * @param ucsd      the charset detector to be used.
 166  * @param status    any error conditions are reported back in this variable.
 167  * @return          a UCharsetMatch representing the best matching charset,
 168  *                  or NULL if no charset matches the byte data.
 169  *
 170  * @stable ICU 3.6
 171  */
 172 U_STABLE const UCharsetMatch * U_EXPORT2
 173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
 174
 175
 176 /**
 177  *  Find all charset matches that appear to be consistent with the input,
 178  *  returning an array of results.  The results are ordered with the
 179  *  best quality match first.
 180  *
 181  *  Because the detection only looks at a limited amount of the
 182  *  input byte data, some of the returned charsets may fail to handle
 183  *  all of the input data.
 184  *  <p>
 185  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
 186  *  They will remain valid until the detector is closed or modified.
 187  *
 188  * <p>
 189  * An error is returned if
 190  *  <ul>
 191  *    <li>no charsets appear to match the input data.</li>
 192  *    <li>no input text has been provided.</li>
 193  *  </ul>
 194  *
 195  * @param ucsd          the charset detector to be used.
 196  * @param matchesFound  pointer to a variable that will be set to the
 197  *                      number of charsets identified that are consistent with
 198  *                      the input data.  Output only.
 199  * @param status        any error conditions are reported back in this variable.
 200  * @return              A pointer to an array of pointers to UCharsetMatch objects.
 201  *                      This array, and the UCharsetMatch instances to which it refers,
 202  *                      are owned by the UCharsetDetector, and will remain valid until
 203  *                      the detector is closed or modified.
 204  * @stable ICU 3.6
 205  */
 206 U_STABLE const UCharsetMatch ** U_EXPORT2
 207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
 208
 209
 210
 211 /**
 212  *  Get the name of the charset represented by a UCharsetMatch.
 213  *
 214  *  The storage for the returned name string is owned by the
 215  *  UCharsetMatch, and will remain valid while the UCharsetMatch
 216  *  is valid.
 217  *
 218  *  The name returned is suitable for use with the ICU conversion APIs.
 219  *
 220  *  @param ucsm    The charset match object.
 221  *  @param status  Any error conditions are reported back in this variable.
 222  *  @return        The name of the matching charset, owned by the UCharsetMatch.
 223  *
 224  *  @stable ICU 3.6
 225  */
 226 U_STABLE const char * U_EXPORT2
 227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
 228
 229 /**
 230  *  Get a confidence number for the quality of the match of the byte
 231  *  data with the charset.  Confidence numbers range from zero to 100,
 232  *  with 100 representing complete confidence and zero representing
 233  *  no confidence.
 234  *
 235  *  The confidence values are somewhat arbitrary.  They define
 236  *  an ordering within the results for any single detection operation
 237  *  but are not generally comparable between the results for different input.
 238  *
 239  *  A confidence value of 10 does have a general meaning - it is used
 240  *  for charsets that can represent the input data, but for which there
 241  *  is no other indication that suggests that the charset is the correct one.
 242  *  Pure 7 bit ASCII data, for example, is compatible with a
 243  *  great many charsets, most of which will appear as possible matches
 244  *  with a confidence of 10.
 245  *
 246  *  @param ucsm    The charset match object.
 247  *  @param status  Any error conditions are reported back in this variable.
 248  *  @return        A confidence number (zero to 100) for the charset match.
 249  *
 250  *  @stable ICU 3.6
 251  */
 252 U_STABLE int32_t U_EXPORT2
 253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
 254
 255 /**
 256  *  Get the RFC 3066 code for the language of the input data.
 257  *
 258  *  The Charset Detection service is intended primarily for detecting
 259  *  charsets, not language.  For some, but not all, charsets, a language is
 260  *  identified as a byproduct of the detection process, and that is what
 261  *  is returned by this function.
 262  *
 263  *  CAUTION:
 264  *    1.  Language information is not available for input data encoded in
 265  *        all charsets. In particular, no language is identified
 266  *        for UTF-8 input data.
 267  *
 268  *    2.  Closely related languages may sometimes be confused.
 269  *
 270  *  If more accurate language detection is required, a linguistic
 271  *  analysis package should be used.
 272  *
 273  *  The storage for the returned language code string is owned by the
 274  *  UCharsetMatch, and will remain valid while the UCharsetMatch
 275  *  is valid.
 276  *
 277  *  @param ucsm    The charset match object.
 278  *  @param status  Any error conditions are reported back in this variable.
 279  *  @return        The RFC 3066 code for the language of the input data, or
 280  *                 an empty string if the language could not be determined.
 281  *
 282  *  @stable ICU 3.6
 283  */
 284 U_STABLE const char * U_EXPORT2
 285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
 286
 287
 288 /**
 289   *  Get the entire input text as a UChar string, placing it into
 290   *  a caller-supplied buffer.  A terminating NUL character will be
 291   *  appended to the buffer if space is available.
 292   *
 293   *  The number of UChars in the output string, not including the terminating
 294   *  NUL, is returned.
 295   *
 296   *  If the supplied buffer is smaller than required to hold the output,
 297   *  the contents of the buffer are undefined.  The full output string length
 298   *  (in UChars) is returned as always, and can be used to allocate a buffer
 299   *  of the correct size.
 300   *
 301   *
 302   * @param ucsm    The charset match object.
 303   * @param buf     A UChar buffer to be filled with the converted text data.
 304   * @param cap     The capacity of the buffer in UChars.
 305   * @param status  Any error conditions are reported back in this variable.
 306   * @return        The number of UChars in the output string, excluding the terminating NUL.
 307   *
 308   * @stable ICU 3.6
 309   */
 310 U_STABLE  int32_t U_EXPORT2
 311 ucsdet_getUChars(const UCharsetMatch *ucsm,
 312                  UChar *buf, int32_t cap, UErrorCode *status);
 313
 314
 315
 316 /**
 317   *  Get an iterator over the set of all detectable charsets -
 318   *  over the charsets that are known to the charset detection
 319   *  service.
 320   *
 321   *  The returned UEnumeration provides access to the names of
 322   *  the charsets.
 323   *
 324   *  <p>
 325   *  The state of the charset detector that is passed in does not
 326   *  affect the result of this function, but requiring a valid, open
 327   *  charset detector as a parameter ensures that the charset detection
 328   *  service has been safely initialized and that the required detection
 329   *  data is available.
 330   *
 331   *  <p>
 332   *  <b>Note:</b> Multiple different charset encodings in the same family may use
 333   *  a single shared name in this implementation. For example, this function returns
 334   *  an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
 335   *  (Windows Latin 1). However, the actual detection result could be "windows-1252"
 336   *  when the input data matches Latin 1 code points with any points only available
 337   *  in "windows-1252".
 338   *
 339   *  @param ucsd a charset detector.
 340   *  @param status  Any error conditions are reported back in this variable.
 341   *  @return an iterator providing access to the detectable charset names.
 342   *  @stable ICU 3.6
 343   */
 344 U_STABLE  UEnumeration * U_EXPORT2
 345 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
 346
 347 /**
 348   *  Test whether input filtering is enabled for this charset detector.
 349   *  Input filtering removes text that appears to be HTML or XML
 350   *  markup from the input before applying the code page detection
 351   *  heuristics.  The setting is changed with ucsdet_enableInputFilter().
 352   *
 353   *  @param ucsd  The charset detector to check.
 354   *  @return TRUE if filtering is enabled.
 355   *  @stable ICU 3.6
 356   */
 357
 358 U_STABLE  UBool U_EXPORT2
 359 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
 360
 361
 362 /**
 363  * Enable filtering of input text. If filtering is enabled,
 364  * text within angle brackets ("<" and ">") will be removed
 365  * before detection, which will remove most HTML or XML markup.
 366  *
 367  * @param ucsd   the charset detector to be modified.
 368  * @param filter <code>TRUE</code> to enable input text filtering.
 369  * @return The previous setting.
 370  *
 371  * @stable ICU 3.6
 372  */
 373 U_STABLE  UBool U_EXPORT2
 374 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
 375
 376 #ifndef U_HIDE_INTERNAL_API
 377 /**
 378   *  Get an iterator over the set of detectable charsets -
 379   *  over the charsets that are enabled in the specified charset detector.
 380   *
 381   *  The returned UEnumeration provides access to the names of
 382   *  the charsets.
 383   *
 384   *  @param ucsd a charset detector.
 385   *  @param status  Any error conditions are reported back in this variable.
 386   *  @return an iterator providing access to the names of the charsets
 387   *  enabled in the specified charset detector.
 388   *  @internal
 389   */
 390 U_INTERNAL UEnumeration * U_EXPORT2
 391 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
 392
 393 /**
 394   * Enable or disable an individual charset encoding.
 395   * The name of the charset encoding must be included in the names returned by
 396   * ucsdet_getAllDetectableCharsets().
 397   *
 398   * @param ucsd a charset detector.
 399   * @param encoding the name of the charset encoding.
 400   * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
 401   *   charset encoding.
 402   * @param status receives the return status. When the name of charset encoding
 403   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
 404   * @internal
 405   */
 406 U_INTERNAL void U_EXPORT2
 407 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
 408 #endif  /* U_HIDE_INTERNAL_API */
 409
 410 #endif
 411 #endif   /* __UCSDET_H */
 412
 413