osfmk/arm/WKdmCompress_new.s

   1 /*
   2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  This file contains armv7 hand optimized implementation of WKdm memory page compressor.
  31
  32         int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
  33
  34         input :
  35                 src_buf : address of input page (length = 1024 words)
  36                 dest_buf : address of output buffer (may not be 16-byte aligned)
  37                 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
  38                 bytes_budget : a given byte target in compression
  39
  40         output :
  41
  42                 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
  43                 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
  44                 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
  45                 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
  46                 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
  47
  48  The 4th argument bytes_budget is the target compress budget in bytes.
  49  Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
  50  Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
  51
  52  WKdm Compression algorithm is briefly stated as follows:
  53
  54         There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
  55         For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
  56                 a. k = (x>>10)&255;                                             // 8-bit hash table index
  57                 b. dict_index = hashTable[k];                   // 4-bit dictionary index, hashTable[] is fixed
  58                 c. dict_word = dictionary[dict_index];  // 32-bit dictionary word, dictionary[] is dynamically updated
  59
  60         Each input word x is classified/tagged into 4 classes :
  61                 0 : x = 0
  62                 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
  63                 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
  64                 3 : (x == dict_word), the exact input word is in the dictionary
  65
  66         For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
  67                 0 : 2-bits tag (32->2 compression)
  68                 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
  69                 2 : 2-bits tag + 32-bits new word (32->34 expansion)
  70                 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
  71
  72         It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
  73
  74         the output bit stream (*dest_buf) consists of
  75                 a. 12 bytes header
  76                 b. 256 bytes for 1024 packed tags
  77                 c. (varying number of) words for new words not matched to dictionary word.
  78                 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
  79                 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
  80
  81         the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
  82         Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
  83
  84
  85         The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
  86
  87         The temp buffers are
  88
  89                 uint8_t         tempTagsArray[1024];                    // temporary saving for tags before final packing
  90                 uint8_t         tempQPosArray[1024];                    // temporary saving for dict_indices before final packing
  91                 uint16_t        tempLowBitsArray[1024];                 // temporary saving for partially matched lower 10 bits before final packing
  92
  93         Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
  94         the destination buffer.
  95
  96                 uint32_t        *new_word = dest_buf+3+64;              // 3 words for header, 64 words for tags, new words come right after the tags.
  97
  98         Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
  99
 100         byte_count = bytes_budget - 12 - 256;    // header + tags
 101
 102         whenever an input word is classified as class
 103
 104                 2 : byte_count -= 4;
 105
 106         in 4-bit/10-bit packing, we can also return -1 when byte_budget <=0;
 107
 108         Note : since there might be extra padding bits for class 1 and 3, it is complicated to track this padding bits on the fly. To compromise, we change class 1 to
 109
 110         without showing the bit budget management, the pseudo code is given as follows:
 111
 112         uint8_t         *tags=tempTagsArray;
 113         uint8_t         *dict=tempQPosArray;
 114         uint8_t         *partial=tempLowBitsArray;
 115
 116         for (i=0;i<1024;i++) {
 117                         x = *src_buf++;
 118                         if (x == 0) {           // zero, 2-bits tag
 119                                         *tags++ = 0;
 120                         } else {
 121
 122                                 // find dict_index and dict_word from x
 123                                 k = (x>>10)&255;
 124                                 dict_index = hashTable[k];
 125                                 dict_word = dictionary[dict_index];
 126
 127                                 if (dict_word == x) { // exactly match
 128                                         // 2-bits tag + 4-bits table index
 129                                         *tags++ = 3;
 130                                         *dict++ = dict_index;
 131                                 } else if (((x^dict_word)>>10)==0) {    // 22 higher bits matched
 132                                         // 2-bits tag + 4-bits table index + 10-bits lower partial
 133                                         *tags++ = 1;
 134                     *dict++ = dict_index;
 135                                         *partial++ = x &0x3ff;
 136                                         dictionary[dict_index] = x;
 137                                 } else {        // not matched
 138                                         // 2-bits tag + 32-bits new word
 139                                         *tags++ = 2;
 140                                         *new_word++ = x;
 141                                         dictionary[dict_index] = x;
 142                                 }
 143                         }
 144         }
 145
 146         after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
 147
 148                 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
 149                 2. dictionary indices (4-bits each) are packed into are right after the new words section
 150                 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
 151
 152         cclee, 11/9/12
 153
 154     Added zero page, single value page, sparse page, early abort optimizations
 155     rsrini, 09/14/14
 156 */
 157         .text
 158         .align 4
 159
 160         // int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
 161
 162 .globl _WKdm_compress_new
 163 _WKdm_compress_new:
 164
 165 /*
 166          -------------------------       symbolizing register use          -----------------------------------
 167 */
 168
 169         #define src_buf                         r0
 170         #define next_input_word         r0
 171         #define dest_buf                        r1
 172         #define scratch                         r2
 173         #define dictionary                      sp
 174         #define byte_count                      r3
 175
 176         #define next_tag                        r12
 177
 178         #define remaining                       r4
 179         #define next_full_patt          r5
 180         #define dict_location           r6
 181         #define next_qp                         r8
 182         #define hashTable                       r9
 183         #define next_low_bits           r10
 184         #define eax                                     r11
 185         #define ecx                                     r12
 186         #define edx                                     lr
 187         #define rdi                                     r6
 188
 189     #define tempTagsArray       scratch
 190     #define R11                 r0                      // only safe to use between phase-1 and phase-2
 191     #define R13                 r4                      // only safe to use between phase-1 and phase-2
 192 /*
 193                 -------------------------    allocate scratch memory for local use  --------------------------------------
 194
 195         need 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*2 (tempLowBitsArray)
 196         total 4096 bytes
 197         [scratch,#0] : tempTagsArray
 198         [scratch,#1024] : tempQPosArray
 199         [scratch,#2048] : tempLowBitsArray
 200
 201         [sp,#0] : dictionary
 202
 203 */
 204
 205         #define TagsArray_offset        0
 206         #define QPosArray_offset        1024
 207         #define LowBitsArray_offset     2048
 208
 209     #define SV_RETURN           0                       // return value when SV, ZV page is found
 210     #define MZV_MAGIC           17185                   // magic value used to identify MZV page encoding
 211     #define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
 212     #define CHKPT_WORDS         (CHKPT_BYTES/4)         // checkpoint bytes in words
 213     #define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
 214     #define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
 215                                                         //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
 216 #if CHKPT_BYTES > 4096
 217     #error CHKPT_BYTES must be <= 4096
 218 #endif
 219 #if CHKPT_BYTES < 4
 220     #error CHKPT_BYTES must be >= 4
 221 #endif
 222
 223     push    {r7,lr}
 224     mov     r7, sp
 225     push    {r4-r6,r8-r11}
 226
 227 #if KERNEL
 228         sub             sp, sp, #32
 229         vst1.64 {q0,q1}, [sp]
 230 #endif
 231
 232         sub             sp, sp, #(64+24)                                        // reserve stack space for temps + dictionary
 233
 234 /*
 235                 ----- set up registers and initialize WKdm dictionary ----------
 236 */
 237                                             // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
 238                                             // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
 239         mov             eax, #0
 240
 241         mov             next_tag, scratch                               // &tempTagsArray[0]
 242         vdup.32 q0, eax
 243
 244         add             next_qp, scratch, #QPosArray_offset             // next_qp
 245         mov             lr, sp
 246         mov             remaining, #(CHKPT_WORDS)       // remaining input words .. initially set to checkpoint
 247         vst1.64 {q0}, [lr]!
 248         add             next_full_patt, dest_buf, #268  // dest_buf + [TAGS_AREA_OFFSET + (4096 / 16)]*4
 249         vst1.64 {q0}, [lr]!
 250         vst1.64 {q0}, [lr]!
 251         add             next_low_bits, scratch, #LowBitsArray_offset    // &tempLowBitsArray[0]
 252         vst1.64 {q0}, [lr]!
 253
 254 #if defined(KERNEL) && !SLIDABLE
 255     adr     hashTable, L_table
 256     ldr     hashTable, [hashTable]
 257 #else
 258     ldr     hashTable, L_table
 259 L_table0:
 260     ldr     hashTable, [pc, hashTable]
 261 #endif
 262
 263 #define EARLYCHECK              0
 264 #define NORMAL                  1
 265
 266 #define mode                    [sp, #64]
 267 #define start_next_full_patt    [sp, #68]
 268 #define start_next_input_word   [sp, #72]
 269 #define start_next_low_bits     [sp, #76]
 270 #define byte_budget             [sp, #80]
 271
 272     mov     edx, #EARLYCHECK
 273     str     edx, mode                               // indicate we are yet to evaluate the early aborts
 274     str     next_full_patt, start_next_full_patt    // remember the start of next_full_patt
 275     str     next_input_word, start_next_input_word  // remember the start of next_input_word
 276     str     next_low_bits, start_next_low_bits      // remember the start of next_low_bits
 277     str     byte_count, byte_budget                 // remember the byte budget
 278
 279         sub             byte_count, byte_count, #(12+256)       // byte_count - header bytes - tags bytes
 280         b               L_scan_loop
 281
 282         .align  4, 0x90
 283 L_RECORD_ZERO:
 284         /* we've just detected a zero input word in edx */
 285         strb    edx, [next_tag], #1                                             // *next_tag++ = ZERO;
 286         subs    remaining, remaining, #1                                // remaining input words
 287         ble             CHECKPOINT                              // if remaining = 0, break
 288
 289         /* WKdm compress scan/tag loop */
 290 L_scan_loop:
 291         ldr             edx, [next_input_word], #4
 292         cmp             edx, #0
 293         beq             L_RECORD_ZERO                                                   // if (input_word==0) RECORD_ZERO
 294
 295         /*
 296                 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
 297         */
 298         and             eax, edx, #(0xff<<10)                                   // part of input_word for hash table index
 299         lsr             eax, eax, #10                                                   // 8-bit index to the Hash Table
 300         ldrb    eax, [hashTable, eax]                                   // HASH_TO_DICT_BYTE_OFFSET(input_word)
 301         add             dict_location, dictionary, eax                  // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
 302         ldr             eax, [dictionary, eax]                                  // dict_word = *dict_location;
 303         cmp             eax, edx                                                                // dict_word vs input_word
 304         beq             L_RECORD_EXACT                                                  // if identical, RECORD_EXACT
 305
 306         eor             eax, eax, edx
 307         lsrs    eax, eax, #10                                                   // HIGH_BITS(dict_word)
 308         beq             L_RECORD_PARTIAL                                                // if identical, RECORD_PARTIAL
 309
 310 L_RECORD_MISS:
 311 /*
 312         if we are here, the input word can not be derived from the dictionary,
 313         we write the input word as a new word,
 314         and update the dictionary with this new word
 315 */
 316
 317         subs    byte_count, byte_count, #4
 318         ble             L_budgetExhausted                                               // o.w., return -1 to signal this page is not compressable
 319         str             edx, [next_full_patt], #4                               // *next_full_patt++ = input_word;
 320         mov             eax, #2
 321         str             edx, [dict_location]                                    // *dict_location = input_word
 322         strb    eax, [next_tag], #1                                             // *next_tag++ = 2 for miss
 323         subs    remaining, remaining, #1                                // remaining input words
 324         bgt             L_scan_loop                                                             // if bit_count>0, go on the scan/tag pass,
 325     b       CHECKPOINT
 326
 327 L_done_search:
 328
 329         // SET_QPOS_AREA_START(dest_buf,next_full_patt);
 330         sub             eax, next_full_patt, dest_buf                   // next_full_patt - dest_buf
 331         lsr             eax, eax, #2                                                    // offset in 4-bytes
 332         str             eax, [dest_buf]                                                 // dest_buf[0] = next_full_patt - dest_buf
 333
 334
 335         /* --------------------------     packing 1024 tags into 256 bytes ----------------------------------------*/
 336         // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
 337
 338         add             rdi, dest_buf, #12                                              // dest_buf
 339         mov             eax, scratch                                                    // &tempTagsArray[0]
 340         sub             edx, next_tag, scratch                                  // this should be 1024
 341
 342         vld1.64 {q0,q1}, [eax,:128]!
 343         subs    edx, edx, #32                           // pre-decrement by 32
 344 L_pack_2bits:
 345         subs    edx, edx, #32
 346         vshl.i64        d1, d1, #4
 347         vshl.i64        d3, d3, #4
 348         vorr    d0, d0, d1
 349         vorr    d2, d2, d3
 350         vshr.u64        d1, d0, #30
 351         vshr.u64        d3, d2, #30
 352         vorr    d0, d0, d1
 353         vorr    d2, d2, d3
 354         vzip.32 d0, d2
 355         vst1.64 {d0}, [rdi]!
 356         vld1.64 {q0,q1}, [eax,:128]!
 357         bgt             L_pack_2bits
 358         vshl.i64        d1, d1, #4
 359         vshl.i64        d3, d3, #4
 360         vorr    d0, d0, d1
 361         vorr    d2, d2, d3
 362         vshr.u64        d1, d0, #30
 363         vshr.u64        d3, d2, #30
 364         vorr    d0, d0, d1
 365         vorr    d2, d2, d3
 366         vzip.32 d0, d2
 367         vst1.64 {d0}, [rdi]
 368
 369
 370         /* ---------------------------------      packing 4-bits dict indices into dest_buf ----------------------------------   */
 371
 372         /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
 373         add             ecx, scratch, #QPosArray_offset                 // tempQPosArray
 374         sub             eax, next_qp, ecx                                               // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
 375         add             eax, eax, #7                                                    // num_bytes_to_pack+7
 376         lsr             eax, eax, #3                                                    // num_packed_words = (num_bytes_to_pack + 7) >> 3
 377         subs    byte_count, byte_count, eax, lsl #2             // byte_count -= 4 * packed_words
 378         blt             L_budgetExhausted                                               // o.w., return -1 to signal this page is not compressable
 379         add             ecx, ecx, eax, lsl #3                                   // endQPosArray = tempQPosArray + 2*num_source_words
 380         cmp             ecx, next_qp                                                    // endQPosArray vs next_qp
 381         bls             L16                                                                             // if (next_qp >= endQPosArray) skip the following zero paddings
 382         sub             eax, ecx, next_qp
 383         mov             edx, #0
 384         tst             eax, #4
 385         beq             1f
 386         str             edx, [next_qp], #4
 387 1:      tst             eax, #2
 388         beq             1f
 389         strh    edx, [next_qp], #2
 390 1:      tst             eax, #1
 391         beq             1f
 392         strb    edx, [next_qp], #1
 393 1:
 394 L16:
 395         add             edx, scratch, #QPosArray_offset                 // tempQPosArray
 396         mov             rdi, next_full_patt                                             // next_full_patt
 397         cmp             ecx, edx                                                                // endQPosArray vs tempQPosArray
 398         ldr             eax, [dest_buf]
 399         bls             L20                                                                             // if (endQPosArray <= tempQPosArray) skip the following
 400
 401         /* packing 4-bits dict indices into dest_buf */
 402 L_pack_4bits:
 403         vld1.64 {d0}, [edx,:64]!                                                        // src_next[1]:src_next[0]
 404         vshr.u64        d1, d0, #28                                                     // (src_next[1] << 4)
 405         vorr    d0, d0, d1                                                              // src_next[0] | (src_next[1] << 4)
 406         cmp             ecx, edx                                                                // source_end vs src_next
 407         vstr    s0, [rdi]
 408         add             rdi, rdi, #4
 409         bhi             L_pack_4bits                                                    // while (src_next < source_end) repeat the loop
 410
 411         /*  --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[]   ----------------------------------------- */
 412         // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
 413         sub             eax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 414         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in words
 415 L20:
 416         str             eax, [dest_buf,#4]                                              // dest_buf[1] = boundary_tmp - dest_buf
 417
 418         add             ecx, scratch, #LowBitsArray_offset              // tempLowBitsArray
 419     sub         edx, next_low_bits, ecx                                 // next_low_bits - tempLowBitsArray (in bytes)
 420         lsr             edx, edx, #1                                                    // num_tenbits_to_pack (in half-words)
 421         subs    edx, edx, #3                                                    // pre-decrement num_tenbits_to_pack by 3
 422         blt             1f                                                                              // if num_tenbits_to_pack < 3, skip the following loop
 423 0:
 424         subs    byte_count, byte_count, #4                              // byte_count -= 4
 425         ble             L_budgetExhausted                                               // o.w., return -1 to signal this page is not compressable
 426         ldr             r4,[ecx, #2]                                                    // w2:6bits:w1
 427         ldrh    r0,[ecx], #6                                                    // w0
 428         uxth    r5, r4, ror #16                                                 // w2
 429         uxth    r4, r4                                                                  // w1
 430         orr             r0, r0, r4, lsl #10                                             // w1:w0
 431         subs    edx, edx, #3                                                    // num_tenbits_to_pack-=3
 432         orr             r0, r0, r5, lsl #20                                             // w2:w1:w0
 433         str             r0, [rdi], #4                                                   // pack w0,w1,w2 into 1 dest_buf word
 434         bge             0b                                                                              // if no less than 3 elements, back to loop head
 435
 436 1:      adds    edx, edx, #3                                                    // post-increment num_tenbits_to_pack by 3
 437         beq             3f                                                                              // if num_tenbits_to_pack is a multiple of 3, skip the following
 438         subs    byte_count, byte_count, #4                              // byte_count -= 4
 439         ble             L_budgetExhausted                                               // o.w., return -1 to signal this page is not compressable
 440         ldrh    eax,[ecx]                                                               // w0
 441         subs    edx, edx, #1                                                    // num_tenbits_to_pack--
 442         beq             2f                                                                              //
 443         ldrh    edx, [ecx, #2]                                                  // w1
 444         orr             eax, eax, edx, lsl #10                                  // w0 | (w1<<10)
 445
 446 2:      str             eax, [rdi], #4                                                  // write the final dest_buf word
 447
 448 3:      sub             eax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 449         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in terms of words
 450         str             eax, [dest_buf, #8]                                             // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
 451         lsl             r0, eax, #2                                                             // boundary_tmp - dest_buf in terms of bytes
 452
 453 L_done:
 454         // restore registers and return
 455
 456         add             sp, sp, #(64+24)                                    // skip memory for temps + dictionary
 457 #if KERNEL
 458         vld1.64 {q0,q1}, [sp]!
 459 #endif
 460     pop     {r4-r6,r8-r11}
 461     pop     {r7,pc}
 462
 463         .align  4
 464 L_budgetExhausted:
 465         mov             r0, #-1
 466         b               L_done
 467
 468
 469         .align 4,0x90
 470 L_RECORD_EXACT:
 471 /*
 472                 we have an exact match of the input word to its corresponding dictionary word
 473                 write tag/dict_index to the temorary buffers
 474 */
 475         sub             edx, dict_location, dictionary          // dict_location - dictionary
 476         mov             eax, #3
 477         lsr             edx, edx, #2                                            // divide by 4 for word offset
 478         strb    eax, [next_tag], #1                                     // *next_tag++ = 3 for exact
 479         strb    edx, [next_qp], #1                                      // *next_qp = word offset (4-bit)
 480         subs    remaining, remaining, #1                        // remaining input words
 481         bgt             L_scan_loop                                                     // if remaining>0, go on the scan/tag pass,
 482         b               CHECKPOINT                          // if remaining = 0, break
 483
 484         .align 4,0x90
 485 L_RECORD_PARTIAL:
 486 /*
 487                 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
 488                 write tag/dict_index/low 10 bits to the temorary buffers
 489 */
 490         sub             eax, dict_location, dictionary          // dict_location - dictionary
 491         str             edx, [dict_location]                            // *dict_location = input_word;
 492         lsr             eax, eax, #2                                            // offset in 32-bit word
 493         lsl             edx, edx, #22
 494         strb    eax, [next_qp], #1                                      // update *next_qp++
 495         mov             eax, #1
 496         lsr             edx, edx, #22                                           // lower 10 bits
 497         strb    eax, [next_tag], #1                                     // *next_tag++ = 1 for partial matched
 498         strh    edx, [next_low_bits], #2                        // save next_low_bits++
 499         subs    remaining, remaining, #1                        // remaining input words
 500         bgt             L_scan_loop                                                     // if remaining>0, go on the scan/tag pass,
 501
 502 CHECKPOINT:
 503     ldr     eax, mode                                   // load the mode
 504     cmp     eax, #EARLYCHECK
 505     beq     L_check_compression_ratio                   // early abort check
 506
 507 L_check_zero_page:
 508
 509     ldr     eax, start_next_full_patt                   // check if any dictionary misses in page
 510     cmp     eax, next_full_patt
 511     bne     L_check_single_value_page
 512
 513         add             eax, scratch, #QPosArray_offset                 // get start_next_qp
 514     cmp     eax, next_qp                                // check if any partial or exact dictionary matches
 515
 516     moveq   r0, #SV_RETURN                              // Magic return value
 517         beq     L_done
 518
 519 L_check_single_value_page:
 520
 521     ldr     eax, start_next_full_patt                   // get # dictionary misses
 522     sub     eax, next_full_patt, eax
 523     lsr     eax, eax, #2
 524
 525         add             R11, scratch, #QPosArray_offset                 // get start_next_qp
 526     sub     R11, next_qp, R11                           // get # dictionary hits (exact + partial)
 527
 528     ldr     R13, start_next_low_bits
 529     sub     R13, next_low_bits, R13                     // get # dictionary partial hits
 530     lsrs    R13, R13, #1
 531
 532     // Single value page if one of the follwoing is true:
 533     //  partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
 534     //  partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
 535     //
 536     bne     1f                                          // were there 0 partial hits?
 537
 538     mov     edx, #1023
 539     cmp     R11, edx                                    // were there 1023 dictionary hits
 540     bne     1f
 541
 542     cmp     eax, #1                                     // was there exacly 1 dictionary miss?
 543     bne     1f
 544
 545     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 546     cmp     edx, #2                                     // was the very 1st tag a miss?
 547     beq     L_is_single_value_page
 548
 549 1:
 550     cmp     R13, #1                                     // was there 1 partial hit?
 551     bne     L_check_mostly_zero
 552
 553     mov     edx, #1024
 554     cmp     R11, edx                                    // were there 1024 dictionary hits
 555     bne     L_check_mostly_zero
 556
 557     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 558     cmp     edx, #1                                     // was the very 1st tag a partial?
 559     bne     L_is_single_value_page
 560
 561 L_is_single_value_page:
 562
 563     moveq   r0, #SV_RETURN                              // Magic return value
 564         beq     L_done
 565
 566 L_check_mostly_zero:
 567                                                         // how much space will the sparse packer take?
 568     add     eax, eax, R11                               // eax += (next_qp - start_next_qp)
 569     mov     edx, #6
 570     mov     R11, #4
 571     mla     R11, eax, edx, R11                          // R11 = eax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
 572
 573     ldr     eax, start_next_low_bits
 574     sub     eax, next_low_bits, eax                     // get bytes consumed by lower-10 bits
 575     mov     edx, #1365
 576     mul     eax, eax, edx
 577
 578     ldr     edx, start_next_full_patt
 579     sub     edx, next_full_patt, edx                    // get bytes consumed by dictionary misses
 580     add     eax, edx, eax, lsr #11                      // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 581
 582         add             edx, scratch, #QPosArray_offset                 // get start_next_qp
 583     sub     edx, next_qp, edx
 584     add     eax, eax, edx, lsr #1                       // eax += (next_qp - start_next_qp)/2
 585     mov     edx, #(12+256)
 586     add     eax, eax, edx                               // rax += bytes taken by the header + tags
 587
 588     cmp     eax, R11                                    // is the default packer the better option?
 589     blt     L_done_search
 590
 591     ldr     edx, byte_budget
 592     cmp     R11, edx                                    // can the sparse packer fit into the given budget?
 593     bgt     L_budgetExhausted
 594
 595 L_sparse_packer:
 596
 597     mov     edx, #MZV_MAGIC
 598     str     edx, [dest_buf], #4                         // header to indicate a sparse packer
 599
 600     ldr     R13, start_next_input_word                  // get the starting address of src
 601     mov     edx, #0
 602     mov     ecx, #4096
 603
 604 1:
 605     ldm     R13!, {r2, r3, r5, r6, r7, r8, r9, r10}
 606
 607     teq     r2, #0
 608     teqeq   r3, #0
 609     teqeq   r5, #0
 610     teqeq   r6, #0
 611     teqeq   r7, #0
 612     teqeq   r8, #0
 613     teqeq   r9, #0
 614     teqeq   r10, #0
 615
 616     bne     2f
 617     subs    ecx, ecx, #32
 618     add     edx, edx, #32                               // 16 more bytes have been processed
 619     bne     1b
 620     mov     r0, R11                                     // store the size of the compressed stream
 621     b       L_done
 622
 623 2:
 624     teq     r2, #0
 625     strne   r2, [dest_buf], #4                         // store the non-0 word in the dest buffer
 626     strhne  edx, [dest_buf], #2                        // store the byte index
 627     add     edx, edx, 4
 628
 629     teq     r3, #0
 630     strne   r3, [dest_buf], #4                         // store the non-0 word in the dest buffer
 631     strhne  edx, [dest_buf], #2                        // store the byte index
 632     add     edx, edx, 4
 633
 634     teq     r5, #0
 635     strne   r5, [dest_buf], #4                         // store the non-0 word in the dest buffer
 636     strhne  edx, [dest_buf], #2                        // store the byte index
 637     add     edx, edx, 4
 638
 639     teq     r6, #0
 640     strne   r6, [dest_buf], #4                         // store the non-0 word in the dest buffer
 641     strhne  edx, [dest_buf], #2                        // store the byte index
 642     add     edx, edx, 4
 643
 644     teq     r7, #0
 645     strne   r7, [dest_buf], #4                         // store the non-0 word in the dest buffer
 646     strhne  edx, [dest_buf], #2                        // store the byte index
 647     add     edx, edx, 4
 648
 649     teq     r8, #0
 650     strne   r8, [dest_buf], #4                         // store the non-0 word in the dest buffer
 651     strhne  edx, [dest_buf], #2                        // store the byte index
 652     add     edx, edx, 4
 653
 654     teq     r9, #0
 655     strne   r9, [dest_buf], #4                         // store the non-0 word in the dest buffer
 656     strhne  edx, [dest_buf], #2                        // store the byte index
 657     add     edx, edx, 4
 658
 659     teq     r10, #0
 660     strne   r10, [dest_buf], #4                        // store the non-0 word in the dest buffer
 661     strhne  edx, [dest_buf], #2                        // store the byte index
 662     add     edx, edx, 4
 663
 664     subs    ecx, ecx, #32
 665     bne     1b
 666     mov     r0, R11                                     // store the size of the compressed stream
 667     b       L_done
 668
 669 L_check_compression_ratio:
 670
 671     mov     eax, #NORMAL
 672     str     eax, mode
 673     mov     remaining, #(1024 - CHKPT_WORDS)            // remaining input words to process
 674     cmp     remaining, #0
 675     beq     CHECKPOINT                                  // if there are no remaining words to process
 676
 677     ldr     eax, start_next_low_bits
 678     sub     eax, next_low_bits, eax                     // get bytes consumed by lower-10 bits
 679     mov     edx, #1365
 680     mul     eax, eax, edx
 681
 682     ldr     edx, start_next_full_patt
 683     sub     edx, next_full_patt, edx                    // get bytes consumed by dictionary misses
 684     add     eax, edx, eax, lsr #11                      // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 685
 686         add             edx, scratch, #QPosArray_offset                 // get start_next_qp
 687     sub     edx, next_qp, edx
 688     add     eax, eax, edx, lsr #1                       // eax += (next_qp - start_next_qp)/2
 689     mov     edx, #(CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)
 690     subs    eax, eax, edx                               // eax += CHKPT_TAG_BYTES; eax -= CHKPT_SHRUNK_BYTES
 691     bgt     L_budgetExhausted                           // if eax is > 0, we need to early abort
 692     b       L_scan_loop                                 // we are done
 693
 694
 695 #if defined(KERNEL) && !SLIDABLE
 696     .align  2
 697 L_table:
 698     .long   _hashLookupTable_new
 699 #else
 700         .align  2
 701 L_table:
 702         .long   L_Tab$non_lazy_ptr-(L_table0+8)
 703
 704          .section    __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
 705     .align  2
 706 L_Tab$non_lazy_ptr:
 707     .indirect_symbol    _hashLookupTable_new
 708     .long   0
 709 #endif
 710