icuSources/tools/genrb/rle.c

   1 /*
   2 *******************************************************************************
   3 *
   4 *   Copyright (C) 2000-2003, International Business Machines
   5 *   Corporation and others.  All Rights Reserved.
   6 *
   7 *******************************************************************************
   8 *
    9 * File rle.c
  10 *
  11 * Modification History:
  12 *
  13 *   Date        Name        Description
  14 *   01/11/02    Ram        Creation.
  15 *******************************************************************************
  16 */
  17 #include "rle.h"
  18 /**
  19  * The ESCAPE character is used during run-length encoding.  It signals
  20  * a run of identical chars.
  21  */
  22 static const uint16_t ESCAPE = 0xA5A5;
  23
  24 /**
  25  * The ESCAPE_BYTE character is used during run-length encoding.  It signals
  26  * a run of identical bytes.
  27  */
  28 static const uint8_t ESCAPE_BYTE = (uint8_t)0xA5;
  29
  30 /**
  31  * Append a byte to the given StringBuffer, packing two bytes into each
  32  * character.  The state parameter maintains intermediary data between
  33  * calls.
  34  * @param state A two-element array, with state[0] == 0 if this is the
  35  * first byte of a pair, or state[0] != 0 if this is the second byte
  36  * of a pair, in which case state[1] is the first byte.
  37  */
  38 static uint16_t*
  39 appendEncodedByte(uint16_t* buffer, uint16_t* buffLimit, uint8_t value, uint8_t state[],UErrorCode* status) {
  40     if(!status || U_FAILURE(*status)){
  41         return NULL;
  42     }
  43     if (state[0] != 0) {
  44         uint16_t c = (uint16_t) ((state[1] << 8) | (((int32_t) value) & 0xFF));
  45         if(buffer < buffLimit){
  46             *buffer++ = c;
  47         }else{
  48             *status = U_BUFFER_OVERFLOW_ERROR;
  49         }
  50         state[0] = 0;
  51         return buffer;
  52     }
  53     else {
  54         state[0] = 1;
  55         state[1] = value;
  56         return buffer;
  57     }
  58 }
  59 /**
  60  * Encode a run, possibly a degenerate run (of < 4 values).
  61  * @param length The length of the run; must be > 0 && <= 0xFF.
  62  */
  63 static uint16_t*
  64 encodeRunByte(uint16_t* buffer,uint16_t* bufLimit, uint8_t value, int32_t length, uint8_t state[], UErrorCode* status) {
  65     if(!status || U_FAILURE(*status)){
  66         return NULL;
  67     }
  68     if (length < 4) {
  69         int32_t j=0;
  70         for (; j<length; ++j) {
  71             if (value == ESCAPE_BYTE) {
  72                 buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status);
  73             }
  74             buffer = appendEncodedByte(buffer,bufLimit, value, state, status);
  75         }
  76     }
  77     else {
  78         if (length == ESCAPE_BYTE) {
  79             if (value == ESCAPE_BYTE){
  80                buffer =  appendEncodedByte(buffer, bufLimit,ESCAPE_BYTE, state,status);
  81             }
  82             buffer = appendEncodedByte(buffer,bufLimit, value, state, status);
  83             --length;
  84         }
  85         buffer = appendEncodedByte(buffer,bufLimit, ESCAPE_BYTE, state,status);
  86         buffer = appendEncodedByte(buffer,bufLimit, (char)length, state, status);
  87         buffer = appendEncodedByte(buffer,bufLimit, value, state, status); /* Don't need to escape this value*/
  88     }
  89     return buffer;
  90 }
  91
  92 #define APPEND( buffer, bufLimit, value, num, status){  \
  93     if(buffer<bufLimit){                                                                \
  94         *buffer++=(value);                                                              \
  95     }else{                                                                                              \
  96         *status = U_BUFFER_OVERFLOW_ERROR;                              \
  97     }                                                                                                   \
  98     num++;                                                                                              \
  99 }
 100
 101 /**
 102  * Encode a run, possibly a degenerate run (of < 4 values).
 103  * @param length The length of the run; must be > 0 && <= 0xFFFF.
 104  */
 105 static uint16_t*
 106 encodeRunShort(uint16_t* buffer,uint16_t* bufLimit, uint16_t value, int32_t length,UErrorCode* status) {
 107     int32_t num=0;
 108         if (length < 4) {
 109         int j=0;
 110         for (; j<length; ++j) {
 111             if (value == (int32_t) ESCAPE){
 112                 APPEND(buffer,bufLimit,ESCAPE, num, status);
 113
 114             }
 115             APPEND(buffer,bufLimit,value,num, status);
 116         }
 117     }
 118     else {
 119         if (length == (int32_t) ESCAPE) {
 120             if (value == (int32_t) ESCAPE){
 121                 APPEND(buffer,bufLimit,ESCAPE,num,status);
 122
 123             }
 124             APPEND(buffer,bufLimit,value,num,status);
 125             --length;
 126         }
 127         APPEND(buffer,bufLimit,ESCAPE,num,status);
 128         APPEND(buffer,bufLimit,(uint16_t) length, num,status);
 129         APPEND(buffer,bufLimit,(uint16_t)value, num, status); /* Don't need to escape this value */
 130     }
 131     return buffer;
 132 }
 133
 134 /**
 135  * Construct a string representing a char array.  Use run-length encoding.
 136  * A character represents itself, unless it is the ESCAPE character.  Then
 137  * the following notations are possible:
 138  *   ESCAPE ESCAPE   ESCAPE literal
 139  *   ESCAPE n c      n instances of character c
 140  * Since an encoded run occupies 3 characters, we only encode runs of 4 or
 141  * more characters.  Thus we have n > 0 and n != ESCAPE and n <= 0xFFFF.
 142  * If we encounter a run where n == ESCAPE, we represent this as:
 143  *   c ESCAPE n-1 c
 144  * The ESCAPE value is chosen so as not to collide with commonly
 145  * seen values.
 146  */
 147 int32_t
 148 usArrayToRLEString(const uint16_t* src,int32_t srcLen,uint16_t* buffer, int32_t bufLen,UErrorCode* status) {
 149     uint16_t* bufLimit =  buffer+bufLen;
 150         uint16_t* saveBuffer = buffer;
 151     if(buffer < bufLimit){
 152         *buffer++ =  (uint16_t)(srcLen>>16);
 153         if(buffer<bufLimit){
 154             uint16_t runValue = src[0];
 155             int32_t runLength = 1;
 156             int i=1;
 157             *buffer++ = (uint16_t) srcLen;
 158
 159             for (; i<srcLen; ++i) {
 160                 uint16_t s = src[i];
 161                 if (s == runValue && runLength < 0xFFFF){
 162                     ++runLength;
 163                 }else {
 164                     buffer = encodeRunShort(buffer,bufLimit, (uint16_t)runValue, runLength,status);
 165                     runValue = s;
 166                     runLength = 1;
 167                 }
 168             }
 169             buffer= encodeRunShort(buffer,bufLimit,(uint16_t)runValue, runLength,status);
 170         }else{
 171             *status = U_BUFFER_OVERFLOW_ERROR;
 172         }
 173     }else{
 174         *status = U_BUFFER_OVERFLOW_ERROR;
 175     }
 176     return (buffer - saveBuffer);
 177 }
 178
 179 /**
 180  * Construct a string representing a byte array.  Use run-length encoding.
 181  * Two bytes are packed into a single char, with a single extra zero byte at
 182  * the end if needed.  A byte represents itself, unless it is the
 183  * ESCAPE_BYTE.  Then the following notations are possible:
 184  *   ESCAPE_BYTE ESCAPE_BYTE   ESCAPE_BYTE literal
 185  *   ESCAPE_BYTE n b           n instances of byte b
 186  * Since an encoded run occupies 3 bytes, we only encode runs of 4 or
 187  * more bytes.  Thus we have n > 0 and n != ESCAPE_BYTE and n <= 0xFF.
 188  * If we encounter a run where n == ESCAPE_BYTE, we represent this as:
 189  *   b ESCAPE_BYTE n-1 b
 190  * The ESCAPE_BYTE value is chosen so as not to collide with commonly
 191  * seen values.
 192  */
 193 int32_t
 194 byteArrayToRLEString(const uint8_t* src,int32_t srcLen, uint16_t* buffer,int32_t bufLen, UErrorCode* status) {
 195     const uint16_t* saveBuf = buffer;
 196     uint16_t* bufLimit =  buffer+bufLen;
 197     if(buffer < bufLimit){
 198         *buffer++ = ((uint16_t) (srcLen >> 16));
 199
 200         if(buffer<bufLimit){
 201             uint8_t runValue = src[0];
 202             int runLength = 1;
 203             uint8_t state[2]= {0};
 204             int i=1;
 205             *buffer++=((uint16_t) srcLen);
 206             for (; i<srcLen; ++i) {
 207                 uint8_t b = src[i];
 208                 if (b == runValue && runLength < 0xFF){
 209                     ++runLength;
 210                 }
 211                 else {
 212                     buffer = encodeRunByte(buffer, bufLimit,runValue, runLength, state,status);
 213                     runValue = b;
 214                     runLength = 1;
 215                 }
 216             }
 217             buffer = encodeRunByte(buffer,bufLimit, runValue, runLength, state, status);
 218
 219             /* We must save the final byte, if there is one, by padding
 220              * an extra zero.
 221              */
 222             if (state[0] != 0) {
 223                 buffer = appendEncodedByte(buffer,bufLimit, 0, state ,status);
 224             }
 225         }else{
 226             *status = U_BUFFER_OVERFLOW_ERROR;
 227         }
 228     }else{
 229         *status = U_BUFFER_OVERFLOW_ERROR;
 230     }
 231     return (int32_t) (buffer - saveBuf);
 232 }
 233
 234
 235 /**
 236  * Construct an array of shorts from a run-length encoded string.
 237  */
 238 int32_t
 239 rleStringToUCharArray(uint16_t* src, int32_t srcLen, uint16_t* target, int32_t tgtLen, UErrorCode* status) {
 240     int32_t length = 0;
 241     int32_t ai = 0;
 242     int i=2;
 243
 244     if(!status || U_FAILURE(*status)){
 245         return 0;
 246     }
 247     /* the source is null terminated */
 248     if(srcLen == -1){
 249         srcLen = u_strlen(src);
 250     }
 251     if(srcLen <= 2){
 252         return 2;
 253     }
 254     length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]);
 255
 256     if(target == NULL){
 257         return length;
 258     }
 259     if(tgtLen < length){
 260         *status = U_BUFFER_OVERFLOW_ERROR;
 261         return length;
 262     }
 263
 264     for (; i<srcLen; ++i) {
 265         uint16_t c = src[i];
 266         if (c == ESCAPE) {
 267             c = src[++i];
 268             if (c == ESCAPE) {
 269                 target[ai++] = c;
 270             } else {
 271                 int32_t runLength = (int32_t) c;
 272                 uint16_t runValue = src[++i];
 273                 int j=0;
 274                 for (; j<runLength; ++j) {
 275                     target[ai++] = runValue;
 276                 }
 277             }
 278         }
 279         else {
 280             target[ai++] = c;
 281         }
 282     }
 283
 284     if (ai != length){
 285         *status = U_INTERNAL_PROGRAM_ERROR;
 286     }
 287
 288     return length;
 289 }
 290
 291 /**
 292  * Construct an array of bytes from a run-length encoded string.
 293  */
 294 int32_t
 295 rleStringToByteArray(uint16_t* src, int32_t srcLen, uint8_t* target, int32_t tgtLen, UErrorCode* status) {
 296
 297     int32_t length = 0;
 298     UBool nextChar = TRUE;
 299     uint16_t c = 0;
 300     int32_t node = 0;
 301     int32_t runLength = 0;
 302     int32_t i = 2;
 303     int32_t ai=0;
 304
 305     if(!status || U_FAILURE(*status)){
 306         return 0;
 307     }
 308     /* the source is null terminated */
 309     if(srcLen == -1){
 310         srcLen = u_strlen(src);
 311     }
 312     if(srcLen <= 2){
 313         return 2;
 314     }
 315     length = (((int32_t) src[0]) << 16) | ((int32_t) src[1]);
 316
 317     if(target == NULL){
 318         return length;
 319     }
 320     if(tgtLen < length){
 321         *status = U_BUFFER_OVERFLOW_ERROR;
 322         return length;
 323     }
 324
 325     for (; ai<tgtLen; ) {
 326        /* This part of the loop places the next byte into the local
 327         * variable 'b' each time through the loop.  It keeps the
 328         * current character in 'c' and uses the boolean 'nextChar'
 329         * to see if we've taken both bytes out of 'c' yet.
 330         */
 331         uint8_t b;
 332         if (nextChar) {
 333             c = src[i++];
 334             b = (uint8_t) (c >> 8);
 335             nextChar = FALSE;
 336         }
 337         else {
 338             b = (uint8_t) (c & 0xFF);
 339             nextChar = TRUE;
 340         }
 341
 342        /* This part of the loop is a tiny state machine which handles
 343         * the parsing of the run-length encoding.  This would be simpler
 344         * if we could look ahead, but we can't, so we use 'node' to
 345         * move between three nodes in the state machine.
 346         */
 347         switch (node) {
 348         case 0:
 349             /* Normal idle node */
 350             if (b == ESCAPE_BYTE) {
 351                 node = 1;
 352             }
 353             else {
 354                 target[ai++] = b;
 355             }
 356             break;
 357         case 1:
 358            /* We have seen one ESCAPE_BYTE; we expect either a second
 359             * one, or a run length and value.
 360             */
 361             if (b == ESCAPE_BYTE) {
 362                 target[ai++] = ESCAPE_BYTE;
 363                 node = 0;
 364             }
 365             else {
 366                 runLength = b;
 367                 node = 2;
 368             }
 369             break;
 370         case 2:
 371             {
 372                 int j=0;
 373                /* We have seen an ESCAPE_BYTE and length byte.  We interpret
 374                 * the next byte as the value to be repeated.
 375                 */
 376                 for (; j<runLength; ++j){
 377                     if(ai<tgtLen){
 378                         target[ai++] = b;
 379                     }else{
 380                         *status = U_BUFFER_OVERFLOW_ERROR;
 381                         return ai;
 382                     }
 383                 }
 384                 node = 0;
 385                 break;
 386             }
 387         }
 388     }
 389
 390     if (node != 0){
 391         *status = U_INTERNAL_PROGRAM_ERROR;
 392         /*("Bad run-length encoded byte array")*/
 393         return 0;
 394     }
 395
 396
 397     if (i != srcLen){
 398         /*("Excess data in RLE byte array string");*/
 399         *status = U_INTERNAL_PROGRAM_ERROR;
 400         return ai;
 401     }
 402
 403     return ai;
 404 }
 405