icuSources/i18n/inputext.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /*
   4  **********************************************************************
   5  *   Copyright (C) 2005-2016, International Business Machines
   6  *   Corporation and others.  All Rights Reserved.
   7  **********************************************************************
   8  */
   9
  10 #include "unicode/utypes.h"
  11
  12 #if !UCONFIG_NO_CONVERSION
  13
  14 #include "inputext.h"
  15
  16 #include "cmemory.h"
  17 #include "cstring.h"
  18
  19 #include <string.h>
  20
  21 U_NAMESPACE_BEGIN
  22
// Capacity, in bytes, of the fInputBytes working buffer. MungeInput() never
// stores more than this many bytes of (possibly markup-stripped) input.
#define BUFFER_SIZE 8192

// Allocate uninitialized storage for `count` objects of `type` via uprv_malloc().
// The macro performs no NULL check itself; the caller must test the result.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
// Release storage with uprv_free().
#define DELETE_ARRAY(array) uprv_free((void *) (array))
  27
  28 InputText::InputText(UErrorCode &status)
  29     : fInputBytes(NEW_ARRAY(uint8_t, BUFFER_SIZE)), // The text to be checked.  Markup will have been
  30                                                  //   removed if appropriate.
  31       fByteStats(NEW_ARRAY(int16_t, 256)),       // byte frequency statistics for the input text.
  32                                                  //   Value is percent, not absolute.
  33       fDeclaredEncoding(0),
  34       fRawInput(0),
  35       fRawLength(0)
  36 {
  37     if (fInputBytes == NULL || fByteStats == NULL) {
  38         status = U_MEMORY_ALLOCATION_ERROR;
  39     }
  40 }
  41
  42 InputText::~InputText()
  43 {
  44     DELETE_ARRAY(fDeclaredEncoding);
  45     DELETE_ARRAY(fByteStats);
  46     DELETE_ARRAY(fInputBytes);
  47 }
  48
  49 void InputText::setText(const char *in, int32_t len)
  50 {
  51     fInputLen  = 0;
  52     fC1Bytes   = FALSE;
  53     fRawInput  = (const uint8_t *) in;
  54     fRawLength = len == -1? (int32_t)uprv_strlen(in) : len;
  55 }
  56
  57 void InputText::setDeclaredEncoding(const char* encoding, int32_t len)
  58 {
  59     if(encoding) {
  60         if (len == -1) {
  61             len = (int32_t)uprv_strlen(encoding);
  62         }
  63
  64         len += 1;     // to make place for the \0 at the end.
  65         uprv_free(fDeclaredEncoding);
  66         fDeclaredEncoding = NEW_ARRAY(char, len);
  67         uprv_strncpy(fDeclaredEncoding, encoding, len);
  68     }
  69 }
  70
  71 UBool InputText::isSet() const
  72 {
  73     return fRawInput != NULL;
  74 }
  75
  76 /**
  77 *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
  78 *               it by removing what appears to be html markup. Currently only used
  79 *               by CharsetDetector::detectAll.
  80 *
  81 * @internal
  82 */
  83 void InputText::MungeInput(UBool fStripTags) {
  84     int     srci = 0;
  85     int     dsti = 0;
  86     uint8_t b;
  87     bool    inMarkup = FALSE;
  88     bool    inCSSDecl = FALSE;
  89     int32_t openTags = 0;
  90     int32_t badTags  = 0;
  91
  92     //
  93     //  html / xml markup stripping.
  94     //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
  95     //     discard everything within < brackets >
  96     //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
  97     //     guess as to whether the input was actually marked up at all.
  98     // TODO: Think about how this interacts with EBCDIC charsets that are detected.
  99     if (fStripTags) {
 100         for (srci = 0; srci < fRawLength && dsti < BUFFER_SIZE; srci += 1) {
 101             b = fRawInput[srci];
 102
 103             if ((b == (uint8_t)0x3C) && !inCSSDecl) { /* Check for the ASCII '<' */
 104                 if (inMarkup) {
 105                     badTags += 1;
 106                 }
 107                 inMarkup = TRUE;
 108                 openTags += 1;
 109             }
 110
 111             if ((b == (uint8_t)0x7B) && !inMarkup) { /* Check for the ASCII '{' */
 112                 if (inCSSDecl) {
 113                     badTags += 1;
 114                 }
 115                 inCSSDecl = TRUE;
 116                 openTags += 1;
 117             }
 118
 119             if (!inMarkup && !inCSSDecl) {
 120                 fInputBytes[dsti++] = b;
 121             }
 122
 123             if (b == (uint8_t)0x3E) { /* Check for the ASCII '>' */
 124                 inMarkup = FALSE;
 125             }
 126             if (b == (uint8_t)0x7D) { /* Check for the ASCII '}' */
 127                 inCSSDecl = FALSE;
 128             }
 129         }
 130
 131         fInputLen = dsti;
 132     }
 133
 134     //
 135     //  If it looks like this input wasn't marked up, or if it looks like it's
 136     //    essentially nothing but markup abandon the markup stripping.
 137     //    Detection will have to work on the unstripped input.
 138     //
 139     if (openTags<5 || openTags/5 < badTags ||
 140         (fInputLen < 100 && fRawLength>600))
 141     {
 142         int32_t limit = fRawLength;
 143
 144         if (limit > BUFFER_SIZE) {
 145             limit = BUFFER_SIZE;
 146         }
 147
 148         for (srci=0; srci<limit; srci++) {
 149             fInputBytes[srci] = fRawInput[srci];
 150         }
 151
 152         fInputLen = srci;
 153     }
 154
 155     //
 156     // Tally up the byte occurence statistics.
 157     // These are available for use by the various detectors.
 158     //
 159
 160     uprv_memset(fByteStats, 0, (sizeof fByteStats[0]) * 256);
 161
 162     for (srci = 0; srci < fInputLen; srci += 1) {
 163         fByteStats[fInputBytes[srci]] += 1;
 164     }
 165
 166     for (int32_t i = 0x80; i <= 0x9F; i += 1) {
 167         if (fByteStats[i] != 0) {
 168             fC1Bytes = TRUE;
 169             break;
 170         }
 171     }
 172 }
 173
 174 U_NAMESPACE_END
 175 #endif
 176