osfmk/x86_64/WKdmCompress_new.s

   1 /*
   2  * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  This file contains x86_64 hand optimized implementation of WKdm memory page compressor.
  31
  32         int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
  33
  34         input :
  35                 src_buf : address of input page (length = 1024 words)
  36                 dest_buf : address of output buffer (may not be 16-byte aligned)
  37                 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
  38                 bytes_budget : a given byte target in compression
  39
  40         output :
  41
  42                 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
  43                 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
  44                 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
  45                 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
  46                 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
  47
  48  The 4th argument bytes_budget is the target compress budget in bytes.
  49  Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
  50  Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
  51
  52  WKdm Compression algorithm is briefly stated as follows:
  53
  54         There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
  55         For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
  56                 a. k = (x>>10)&255;                                             // 8-bit hash table index
  57                 b. dict_index = hashTable[k];                   // 4-bit dictionary index, hashTable[] is fixed
  58                 c. dict_word = dictionary[dict_index];  // 32-bit dictionary word, dictionary[] is dynamically updated
  59
  60         Each input word x is classified/tagged into 4 classes :
  61                 0 : x = 0
  62                 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
  63                 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
  64                 3 : (x == dict_word), the exact input word is in the dictionary
  65
  66         For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
  67                 0 : 2-bits tag (32->2 compression)
  68                 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
  69                 2 : 2-bits tag + 32-bits new word (32->34 expansion)
  70                 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
  71
  72         It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
  73
  74         the output bit stream (*dest_buf) consists of
  75                 a. 12 bytes header
  76                 b. 256 bytes for 1024 packed tags
  77                 c. (varying number of) words for new words not matched to dictionary word.
  78                 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
  79                 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
  80
  81         the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
  82         Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
  83
  84
  85         The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
  86
  87         The temp buffers are
  88
  89                 uint8_t         tempTagsArray[1024];                    // temporary saving for tags before final packing
  90                 uint8_t         tempQPosArray[1024];                    // temporary saving for dict_indices before final packing
  91                 uint16_t        tempLowBitsArray[1024];                 // temporary saving for partially matched lower 10 bits before final packing
  92
  93         Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
  94         the destination buffer.
  95
  96                 uint32_t        *new_word = dest_buf+3+64;              // 3 words for header, 64 words for tags, new words come right after the tags.
  97
  98         Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
  99
 100         bytes_budget -= 12 + 256; // header and tags (1024 * 2 /8 = 256 bytes)
 101
 102         whenever an input word is classified as class
 103
 104                 2 : bytes_budget-=4; if (bytes_budget<=0) exit -1;
 105
 106         when writing the 8 4-bits/3 10-bits, monitor bytes_budget and exit -1 when byte_budget <=0;
 107
 108         without showing the bit budget management, the pseudo code is given as follows:
 109
 110         uint8_t         *tags=tempTagsArray;
 111         uint8_t         *dict=tempQPosArray;
 112         uint8_t         *partial=tempLowBitsArray;
 113
 114         for (i=0;i<1024;i++) {
 115                         x = *src_buf++;
 116                         if (x == 0) {           // zero, 2-bits tag
 117                                         *tags++ = 0;
 118                         } else {
 119
 120                                 // find dict_index and dict_word from x
 121                                 k = (x>>10)&255;
 122                                 dict_index = hashTable[k];
 123                                 dict_word = dictionary[dict_index];
 124
 125                                 if (dict_word == x) { // exactly match
 126                                         // 2-bits tag + 4-bits table index
 127                                         *tags++ = 3;
 128                                         *dict++ = dict_index;
 129                                 } else if (((x^dict_word)>>10)==0) {    // 22 higher bits matched
 130                                         // 2-bits tag + 4-bits table index + 10-bits lower partial
 131                                         *tags++ = 1;
 132                     *dict++ = dict_index;
 133                                         *partial++ = x &0x3ff;
 134                                         dictionary[dict_index] = x;
 135                                 } else {        // not matched
 136                                         // 2-bits tag + 32-bits new word
 137                                         *tags++ = 2;
 138                                         *new_word++ = x;
 139                                         dictionary[dict_index] = x;
 140                                 }
 141                         }
 142         }
 143
 144         after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
 145
 146                 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
 147                 2. dictionary indices (4-bits each) are packed into are right after the new words section
 148                 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
 149
 150         cclee, 11/30/12
 151
 152     Added zero page, single value page, sparse page, early abort optimizations
 153     rsrini, 09/14/14
 154
 155 */
 156
 157         .text
 158         .align 4,0x90
 159
 160 #define SV_RETURN           $0                      // return value when SV, ZV page is found
 161 #define MZV_MAGIC           $17185                  // magic value used to identify MZV page encoding
 162 #define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
 163 #define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
 164 #define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
 165                                                     //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
 166
 167 #if CHKPT_BYTES > 4096
 168     #error CHKPT_BYTES must be <= 4096
 169 #endif
 170 #if CHKPT_BYTES < 4
 171     #error CHKPT_BYTES must be >= 4
 172 #endif
 173
 174 .globl _WKdm_compress_new
 175 _WKdm_compress_new:
 176         pushq   %rbp
 177         movq    %rsp, %rbp
 178         pushq   %r15
 179         pushq   %r14
 180         pushq   %r13
 181         pushq   %r12
 182         pushq   %rbx
 183         subq    $(48+64), %rsp
 184
 185         #define tempTagsArray       64(%rsp)
 186         #define tempLowBitsArray        72(%rsp)
 187
 188     #define start_next_full_patt  80(%rsp)
 189     #define start_next_input_word 88(%rsp)
 190     #define byte_budget           96(%rsp)
 191     #define start_next_qp         tempQPosArray
 192     #define start_next_low_bits   tempLowBitsArray
 193
 194         #define next_tag                        %r8
 195         #define next_input_word         %rdi
 196         #define end_of_input            %r13
 197         #define next_full_patt          %rbx
 198         #define dict_location           %rcx
 199         #define next_qp                         %r10
 200     #define checkpoint          %r11
 201         #define dictionary                      %rsp
 202         #define dest_buf                        %r12
 203         #define hashTable                       %r14
 204         #define tempQPosArray           %r15
 205         #define next_low_bits           %rsi
 206         #define byte_count                      %r9d
 207
 208         movq    %rsi, %r12                                              // dest_buf
 209
 210         movq    %rdx, tempTagsArray                     // &tempTagsArray[0]
 211         movq    %rdx, next_tag                                  // next_tag always points to the one following the current tag
 212
 213         leaq    1024(%rdx), tempQPosArray               // &tempQPosArray[0]
 214         movq    tempQPosArray, next_qp                  // next_qp
 215
 216     leaq    CHKPT_BYTES(%rdi), checkpoint   // checkpoint = src_buf + CHKPT_BYTES
 217         leaq    4096(%rdi), end_of_input                // end_of_input = src_buf + num_input_words
 218         leaq    268(%rsi), %rbx                                 // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
 219
 220         movl    %ecx, byte_count
 221         subl    $(12+256), byte_count                   // header + tags
 222         jle             L_budgetExhausted
 223
 224                                             // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
 225                                             // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
 226         // PRELOAD_DICTIONARY;
 227         movl    $0, 0(dictionary)
 228         movl    $0, 4(dictionary)
 229         movl    $0, 8(dictionary)
 230         movl    $0, 12(dictionary)
 231         movl    $0, 16(dictionary)
 232         movl    $0, 20(dictionary)
 233         movl    $0, 24(dictionary)
 234         movl    $0, 28(dictionary)
 235         movl    $0, 32(dictionary)
 236         movl    $0, 36(dictionary)
 237         movl    $0, 40(dictionary)
 238         movl    $0, 44(dictionary)
 239         movl    $0, 48(dictionary)
 240         movl    $0, 52(dictionary)
 241         movl    $0, 56(dictionary)
 242         movl    $0, 60(dictionary)
 243
 244         leaq    2048(%rdx), %rax                                // &tempLowBitsArray[0]
 245         movq    %rax, tempLowBitsArray                  // save for later reference
 246         movq    %rax, next_low_bits                             // next_low_bits
 247
 248         leaq    _hashLookupTable_new(%rip), hashTable   // hash look up table
 249
 250     movq    next_full_patt, start_next_full_patt
 251     movq    next_input_word, start_next_input_word
 252     movl    %ecx, byte_budget               // save the byte budget
 253
 254
 255         jmp             L_scan_loop
 256
 257         .align 4,0x90
 258 L_RECORD_ZERO:
 259         movb    $0, -1(next_tag)                                                // *next_tag = ZERO;
 260         addq    $4, next_input_word                                     // next_input_word++;
 261         cmpq    next_input_word, checkpoint             // checkpoint time?
 262         je              CHECKPOINT
 263
 264 L_scan_loop:
 265         movl    (next_input_word), %edx
 266         incq    next_tag                                                                // next_tag++
 267         testl   %edx, %edx
 268         je              L_RECORD_ZERO                                                   // if (input_word==0) RECORD_ZERO
 269         movl    %edx, %eax                                                              // a copy of input_word
 270         shrl    $10, %eax                                                               // input_high_bits = HIGH_BITS(input_word);
 271         movzbl  %al, %eax                                                               // 8-bit index to the Hash Table
 272         movsbq  (hashTable,%rax),%rax                                   // HASH_TO_DICT_BYTE_OFFSET(input_word)
 273         leaq    (dictionary, %rax), dict_location               // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
 274         movl    (dict_location), %eax                                   // dict_word = *dict_location;
 275         addq    $4, next_input_word                                             // next_input_word++
 276         cmpl    %eax, %edx                                                              // dict_word vs input_word
 277         je              L_RECORD_EXACT                                                  // if identical, RECORD_EXACT
 278         xorl    %edx, %eax
 279         shrl    $10, %eax                                                               // HIGH_BITS(dict_word)
 280         je              L_RECORD_PARTIAL                                                // if identical, RECORD_PARTIAL
 281
 282 L_RECORD_MISS:
 283         movl    %edx, (next_full_patt)                                  // *next_full_patt = input_word;
 284         addq    $4, next_full_patt                                              // next_full_patt++
 285         movl    %edx, (dict_location)                                   // *dict_location = input_word
 286         movb    $2, -1(next_tag)                                                // *next_tag = 2 for miss
 287         subl    $4, byte_count                                                  // fill in a new 4-bytes word
 288         jle             L_budgetExhausted
 289         cmpq    next_input_word, checkpoint             // checkpoint time?
 290         jne     L_scan_loop
 291         jmp         CHECKPOINT
 292
 293 L_done_search:
 294
 295         // SET_QPOS_AREA_START(dest_buf,next_full_patt);
 296         movq    next_full_patt, %rax                                    // next_full_patt
 297         subq    dest_buf, %rax                                                  // next_full_patt - dest_buf
 298         sarq    $2, %rax                                                                // offset in 4-bytes
 299         movl    %eax, %r13d                                                             // r13d = (next_full_patt - dest_buf)
 300         movl    %eax, 0(dest_buf)                                               // dest_buf[0] = next_full_patt - dest_buf
 301         decq    next_tag
 302         cmpq    next_tag, tempTagsArray                                 // &tempTagsArray[0] vs next_tag
 303         jae             L13                                                                             // if (&tempTagsArray[0] >= next_tag), skip the following
 304
 305         // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
 306
 307         movq    dest_buf, %rdi                                                  // dest_buf
 308         movq    tempTagsArray, %rcx                                             // &tempTagsArray[0]
 309
 310         .align 4,0x90
 311 L_pack_2bits:
 312         movq    8(%rcx), %rax                                                   // w3
 313         addq    $16, %rcx                                                               // tempTagsArray += 16;
 314         shlq    $4, %rax
 315         addq    $4, %rdi                                                                // dest_buf += 4;
 316         orq             -16(%rcx), %rax                                                 // w3
 317         movq    %rax, %rdx
 318         shrq    $30, %rax
 319         orl             %edx, %eax
 320         cmpq    %rcx, next_tag                                                  // cmp next_tag vs dest_buf
 321         movl    %eax, 8(%rdi)                                                   // save at *(dest_buf + HEADER_SIZE_IN_WORDS)
 322         ja              L_pack_2bits                                                    // if (next_tag > dest_buf) repeat L_pack_2bits
 323
 324         /* Pack the queue positions into the area just after the full words. */
 325
 326 L13:
 327         mov             next_qp, %rax                                                   // next_qp
 328         sub             tempQPosArray, %rax                                             // num_bytes_to_pack = next_qp - (char *) tempQPosArray;
 329         addl    $7, %eax                                                                // num_bytes_to_pack+7
 330         shrl    $3, %eax                                                                // num_packed_words = (num_bytes_to_pack + 7) >> 3
 331
 332         shll    $2, %eax                                                                // turn into bytes
 333         subl    %eax, byte_count                                                //
 334         jl              L_budgetExhausted
 335         shrl    $1, %eax                                                                // num_source_words = num_packed_words * 2;
 336
 337         leaq    (tempQPosArray,%rax,4), %rcx                    // endQPosArray = tempQPosArray + num_source_words
 338         cmpq    %rcx, next_qp                                                   // next_qp vs endQPosArray
 339         jae             L16                                                                             // if (next_qp >= endQPosArray) skip the following zero paddings
 340         movq    %rcx, %rax
 341         subq    next_qp, %rax
 342         subl    $4, %eax
 343         jl              1f
 344         .align 4,0x90
 345 0:      movl    $0, (next_qp)
 346         addq    $4, next_qp
 347         subl    $4, %eax
 348         jge             0b
 349 1:      testl   $2, %eax
 350         je              1f
 351         movw    $0, (next_qp)
 352         addq    $2, next_qp
 353 1:      testl   $1, %eax
 354         je              1f
 355         movb    $0, (next_qp)
 356         addq    $1, next_qp
 357 1:
 358 L16:
 359         movq    next_full_patt, %rdi                                    // next_full_patt
 360         cmpq    tempQPosArray, %rcx                                             // endQPosArray vs tempQPosArray
 361         jbe             L20                                                                             // if (endQPosArray <= tempQPosArray) skip the following
 362         movq    tempQPosArray, %rdx                                             // tempQPosArray
 363
 364         /* byte_count -= (rcx - tempQPosArray)/2 */
 365
 366         .align 4,0x90
 367 L_pack_4bits:
 368         movl    4(%rdx), %eax                                                   // src_next[1]
 369         addq    $8, %rdx                                                                // src_next += 2;
 370         sall    $4, %eax                                                                // (src_next[1] << 4)
 371         addq    $4, %rdi                                                                // dest_next++;
 372         orl             -8(%rdx), %eax                                                  // temp = src_next[0] | (src_next[1] << 4)
 373         cmpq    %rdx, %rcx                                                              // source_end vs src_next
 374         movl    %eax, -4(%rdi)                                                  // dest_next[0] = temp;
 375         ja              L_pack_4bits                                                    // while (src_next < source_end) repeat the loop
 376
 377         // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
 378         movq    %rdi, %rax                                                              // boundary_tmp
 379         subq    dest_buf, %rax                                                  // boundary_tmp - dest_buf
 380         movq    %rax, %r13                                                              // boundary_tmp - dest_buf
 381         shrq    $2, %r13                                                                // boundary_tmp - dest_buf in words
 382 L20:
 383         movl    %r13d, 4(dest_buf)                                              // dest_buf[1] = boundary_tmp - dest_buf
 384
 385         movq    tempLowBitsArray, %rcx                                  // tempLowBitsArray
 386         movq    next_low_bits, %rbx                                             // next_low_bits
 387         subq    %rcx, %rbx                                                              // next_low_bits - tempLowBitsArray (in bytes)
 388         sarq    $1, %rbx                                                                // num_tenbits_to_pack (in half-words)
 389
 390         #define size    %ebx
 391
 392         subl    $3, size                                                                // pre-decrement num_tenbits_to_pack by 3
 393         jl              1f                                                                              // if num_tenbits_to_pack < 3, skip the following loop
 394
 395         .align  4,0x90
 396 0:
 397         movzwl  4(%rcx), %eax                                                   // w2
 398         addq    $6, %rcx                                                                // next w0/w1/w2 triplet
 399         sall    $10, %eax                                                               // w1 << 10
 400         or              -4(%rcx), %ax                                                   // w1
 401         addq    $4, %rdi                                                                // dest_buf++
 402         sall    $10, %eax                                                               // w1 << 10
 403         or              -6(%rcx), %ax                                                   // (w0) | (w1<<10) | (w2<<20)
 404         subl    $4, byte_count                                                  // fill in a new 4-bytes word
 405         jle             L_budgetExhausted
 406         subl    $3, size                                                                // num_tenbits_to_pack-=3
 407         movl    %eax, -4(%rdi)                                                  // pack w0,w1,w2 into 1 dest_buf word
 408         jge             0b                                                                              // if no less than 3 elements, back to loop head
 409
 410 1:      addl    $3, size                                                                // post-increment num_tenbits_to_pack by 3
 411         je              3f                                                                              // if num_tenbits_to_pack is a multiple of 3, skip the following
 412         movzwl  (%rcx), %eax                                                    // w0
 413         subl    $1, size                                                                // num_tenbits_to_pack--
 414         je              2f                                                                              //
 415         movzwl  2(%rcx), %edx                                                   // w1
 416         sall    $10, %edx                                                               // w1 << 10
 417         orl             %edx, %eax                                                              // w0 | (w1<<10)
 418 2:
 419         subl    $4, byte_count                                                  // fill in a new 4-bytes word
 420         jle             L_budgetExhausted
 421         movl    %eax, (%rdi)                                                    // write the final dest_buf word
 422         addq    $4, %rdi                                                                // dest_buf++
 423
 424 3:      movq    %rdi, %rax                                                              // boundary_tmp
 425         subq    dest_buf, %rax                                                  // boundary_tmp - dest_buf
 426         shrq    $2, %rax                                                                // boundary_tmp - dest_buf in terms of words
 427         movl    %eax, 8(dest_buf)                                               // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
 428         shlq    $2, %rax                                                                // boundary_tmp - dest_buf in terms of bytes
 429
 430 L_done:
 431         // restore registers and return
 432         addq    $(48+64), %rsp
 433         popq    %rbx
 434         popq    %r12
 435         popq    %r13
 436         popq    %r14
 437         popq    %r15
 438         leave
 439         ret
 440
 441     .align  4
 442 L_budgetExhausted:
 443         mov             $-1, %rax
 444         jmp             L_done
 445
 446
 447         .align 4,0x90
 448 L_RECORD_EXACT:
 449         subq    dictionary, %rcx                                        // dict_location - dictionary
 450         sarq    $2, %rcx                                                        // divide by 4 for word offset
 451         movb    $3, -1(next_tag)                                        // *next_tag = 3 for exact
 452         movb    %cl, (next_qp)                                          // *next_qp = word offset (4-bit)
 453         incq    next_qp                                                         // next_qp++
 454         cmpq    next_input_word, checkpoint         // checkpoint time?
 455         jne     L_scan_loop
 456         jmp         CHECKPOINT
 457
 458         .align 4,0x90
 459 L_RECORD_PARTIAL:
 460         movq    %rcx, %rax                                                      // dict_location
 461         movb    $1, -1(next_tag)                                        // *next_tag = 1 for partial matched
 462         subq    dictionary, %rax                                        // dict_location - dictionary
 463         movl    %edx, (%rcx)                                            // *dict_location = input_word;
 464         sarq    $2, %rax                                                        // offset in 32-bit word
 465         movb    %al, (next_qp)                                          // update *next_qp
 466         andl    $1023, %edx                                                     // lower 10 bits
 467         incq    next_qp                                                         // next_qp++
 468         mov             %dx, (next_low_bits)                            // save next_low_bits
 469         addq    $2, next_low_bits                                       // next_low_bits++
 470         cmpq    next_input_word, checkpoint         // checkpoint time?
 471         jne     L_scan_loop
 472
 473 CHECKPOINT:
 474
 475     cmpq        end_of_input, checkpoint            // end of buffer or compression ratio check?
 476     jne     L_check_compression_ratio
 477
 478 L_check_zero_page:
 479                                                 // check if any dictionary misses in page
 480     cmpq    start_next_full_patt, next_full_patt
 481     jne     L_check_single_value_page
 482
 483     cmpq    start_next_qp, next_qp              // check if any partial or exact dictionary matches
 484     jne     L_check_single_value_page
 485
 486     mov     SV_RETURN, %rax                     // Magic return value
 487     jmp     L_done
 488
 489 L_check_single_value_page:
 490
 491     movq    next_full_patt, %rax                // get # dictionary misses
 492     subq    start_next_full_patt, %rax
 493     shrq    $2, %rax
 494
 495     movq    next_qp, %r11                       // get # dictionary hits (exact + partial)
 496     subq    start_next_qp, %r11
 497
 498     movq    next_low_bits, %r13                 // get # dictionary partial hits
 499     subq    start_next_low_bits, %r13
 500     shrq    $1, %r13
 501
 502     movq    tempTagsArray, %r14                 // get the address of the first tag
 503
 504     // Single value page if one of the follwoing is true:
 505     //  partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
 506     //  partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
 507     //
 508     cmpq    $0, %r13                            // were there 0 partial hits?
 509     jne     1f
 510
 511     cmpq    $1023, %r11                         // were there 1023 dictionary hits
 512     jne     1f
 513
 514     cmpq    $1, %rax                            // was there exacly 1 dictionary miss?
 515     jne     1f
 516
 517     cmpb    $2, 0(%r14)                         // was the very 1st tag a miss?
 518     je      L_is_single_value_page
 519
 520 1:
 521     cmpq    $1, %r13                            // was there 1 partial hit?
 522     jne     L_check_mostly_zero
 523
 524     cmpq    $1024, %r11                         // were there 1024 dictionary hits
 525     jne     L_check_mostly_zero
 526
 527     cmpb    $1, 0(%r14)                         // was the very 1st tag a partial?
 528     jne     L_check_mostly_zero
 529
 530 L_is_single_value_page:
 531
 532     mov     SV_RETURN, %rax                     // Magic return value
 533     jmp     L_done
 534
 535 L_check_mostly_zero:
 536                                                 // how much space will the sparse packer take?
 537     addq    %r11, %rax                          // rax += (next_qp - start_next_qp)
 538     movq    $6, %rdx
 539     mulq    %rdx                                // rax *= 6 (i.e. 4 byte word + 2 byte offset)
 540     addq    $4, %rax                            // rax += 4 byte for header
 541     movq    %rax, %r11
 542                                                 // how much space will the defaut packer take?
 543     movq    next_low_bits, %rax
 544     subq    start_next_low_bits, %rax           // get bytes consumed by lower-10 bits
 545     movq    $1365, %rdx
 546     mulq    %rdx
 547     shrq    $11, %rax                           // rax = 2/3*(next_low_bits - start_next_low_bits)
 548     movq    next_full_patt, %rdx
 549     subq    start_next_full_patt, %rdx          // get bytes consumed by dictionary misses
 550     addq    %rdx, %rax                          // rax += (next_full_patt - start_next_full_patt)
 551     movq    next_qp, %rdx
 552     subq    start_next_qp, %rdx
 553     shrq    $1, %rdx                            // get bytes consumed by dictionary hits
 554     addq    %rdx, %rax                          // rax += (next_qp - start_next_qp)/2
 555     addq    $(12+256), %rax                     // rax += bytes taken by the header + tags
 556
 557     cmpq    %r11, %rax                          // is default packer the better option?
 558     jb      L_done_search
 559
 560     cmpl    byte_budget, %r11d                  // can the sparse packer fit into the given budget?
 561     ja      L_budgetExhausted
 562
 563 L_sparse_packer:
 564
 565     movl    MZV_MAGIC, 0(dest_buf)              // header to indicate a sparse packer
 566     addq    $4, dest_buf
 567
 568     movq    $0, %rdx                            // rdx = byte offset in src of non-0 word
 569     movq    start_next_input_word, %r8
 570 1:
 571     movq    0(%r8, %rdx), %rax                  // rax = read dword
 572         testq   %rax, %rax                          // is dword == 0
 573     jne     5f
 574 3:
 575     addq    $8, %rdx                            // 8 more bytes have been processed
 576 4:
 577     cmpq    $4096, %rdx
 578     jne     1b
 579     movq    %r11, %rax                          // store the size of the compressed stream
 580     jmp     L_done
 581
 582 5:
 583     testl   %eax, %eax                          // is lower word == 0
 584     je      6f
 585     movl    %eax, 0(dest_buf)                   // store the non-0 word in the dest buffer
 586     mov     %dx, 4(dest_buf)                    // store the byte index
 587     addq    $6, dest_buf
 588 6:
 589     shrq    $32, %rax                           // get the upper word into position
 590     testl   %eax, %eax                          // is upper word == 0
 591     je      3b
 592     addq    $4, %rdx
 593     movl    %eax, 0(dest_buf)                   // store the word in the dest buffer
 594     mov     %dx, 4(dest_buf)                    // store the byte index
 595     addq    $6, dest_buf
 596     addq    $4, %rdx
 597     jmp     4b
 598
 599 L_check_compression_ratio:
 600
 601     movq    end_of_input, checkpoint            // checkpoint = end of buffer
 602
 603     movq    next_low_bits, %rax
 604     subq    start_next_low_bits, %rax           // get bytes consumed by lower-10 bits
 605     movq    $1365, %rdx
 606     mulq    %rdx
 607     shrq    $11, %rax                           // rax = 2/3*(next_low_bits - start_next_low_bits)
 608
 609     movq    next_full_patt, %rdx
 610     subq    start_next_full_patt, %rdx          // get bytes consumed by dictionary misses
 611     addq    %rdx, %rax                          // rax += (next_full_patt - start_next_full_patt)
 612
 613     movq    next_qp, %rdx
 614     subq    start_next_qp, %rdx
 615     shrq    $1, %rdx
 616     addq    %rdx, %rax                          // rax += (next_qp - start_next_qp)/2
 617
 618     addq    $CHKPT_TAG_BYTES, %rax              // rax += bytes taken by the tags
 619     cmpq    $CHKPT_SHRUNK_BYTES, %rax
 620     ja      L_budgetExhausted                   // compressed size exceeds budget
 621     jmp     L_scan_loop
 622