osfmk/arm64/WKdmCompress_4k.s

   1 /*
   2  * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  This file contains arm64 hand optimized implementation of WKdm memory page compressor.
  31
  32         int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
  33
  34         input :
  35                 src_buf : address of input page (length = 1024 words)
  36                 dest_buf : address of output buffer (may not be 16-byte aligned)
  37                 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
  38                 bytes_budget : a given byte target in compression
  39
  40         output :
  41
  42                 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
  43                 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
  44                 During the scan and tag process, each word that can not be compressed is written to dest_buf right after the area reserved for the 12-byte header and the 256-byte tags.
  45                 When the function returns -1, dest_buf is filled with those words that can not be compressed and its contents should be considered undefined.
  46                 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent a memory fault.
  47
  48  The 4th argument bytes_budget is the target compress budget in bytes.
  49  If the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
  50  Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
  51
  52  WKdm Compression algorithm is briefly stated as follows:
  53
  54         There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 0 at the point of entry to the function.
  55         For a nonzero input word x, an 8-bit field of x (bits 10..17) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
  56                 a. k = (x>>10)&255;                                             // 8-bit hash table index
  57                 b. dict_index = hashTable[k];                   // 4-bit dictionary index, hashTable[] is fixed
  58                 c. dict_word = dictionary[dict_index];  // 32-bit dictionary word, dictionary[] is dynamically updated
  59
  60         Each input word x is classified/tagged into 4 classes :
  61                 0 : x = 0
  62                 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
  63                 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
  64                 3 : (x == dict_word), the exact input word is in the dictionary
  65
  66         For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
  67                 0 : 2-bits tag (32->2 compression)
  68                 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
  69                 2 : 2-bits tag + 32-bits new word (32->34 expansion)
  70                 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
  71
  72         It is obvious now that the WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are frequent repeats of some word patterns (32->6).
  73
  74         the output bit stream (*dest_buf) consists of
  75                 a. 12 bytes header
  76                 b. 256 bytes for 1024 packed tags
  77                 c. (varying number of) words for new words not matched to dictionary word.
  78                 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
  79                 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
  80
  81         the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
  82         Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
  83
  84
  85         The WKdm compress algorithm first runs a scan and classification pass, tagging the input words and writing unpacked data into temporary buffers. It then packs that data into the output buffer.
  86
  87         The temp buffers are
  88
  89                 uint8_t         tempTagsArray[1024];                    // temporary saving for tags before final packing
  90                 uint8_t         tempQPosArray[1024];                    // temporary saving for dict_indices before final packing
  91                 uint16_t        tempLowBitsArray[1024];                 // temporary saving for partially matched lower 10 bits before final packing
  92
  93         Since the new words (that can not be matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
  94         the destination buffer.
  95
  96                 uint32_t        *new_word = dest_buf+3+64;              // 3 words for header, 64 words for tags, new words come right after the tags.
  97
  98         Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass.
  99
 100         byte_count -= 12 + 256;         // byte budget minus header and tags
 101
 102         whenever an input word is classified as class
 103
 104                 2 : byte_count -= 4;
 105
 106         the compress function can exit early (return -1) if the page can not be compressed within the given byte budget (i.e., byte_count <= 0).
 107
 108         without showing the bit budget management, the pseudo code is given as follows:
 109
 110         uint8_t         *tags=tempTagsArray;
 111         uint8_t         *dict=tempQPosArray;
 112         uint16_t        *partial=tempLowBitsArray;
 113
 114         for (i=0;i<1024;i++) {
 115                         x = *src_buf++;
 116                         if (x == 0) {           // zero, 2-bits tag
 117                                         *tags++ = 0;
 118                         } else {
 119
 120                                 // find dict_index and dict_word from x
 121                                 k = (x>>10)&255;
 122                                 dict_index = hashTable[k];
 123                                 dict_word = dictionary[dict_index];
 124
 125                                 if (dict_word == x) { // exactly match
 126                                         // 2-bits tag + 4-bits table index
 127                                         *tags++ = 3;
 128                                         *dict++ = dict_index;
 129                                 } else if (((x^dict_word)>>10)==0) {    // 22 higher bits matched
 130                                         // 2-bits tag + 4-bits table index + 10-bits lower partial
 131                                         *tags++ = 1;
 132                     *dict++ = dict_index;
 133                                         *partial++ = x &0x3ff;
 134                                         dictionary[dict_index] = x;
 135                                 } else {        // not matched
 136                                         // 2-bits tag + 32-bits new word
 137                                         *tags++ = 2;
 138                                         *new_word++ = x;
 139                                         dictionary[dict_index] = x;
 140                                 }
 141                         }
 142         }
 143
 144         after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
 145
 146                 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
 147                 2. dictionary indices (4-bits each) are packed right after the new words section
 148                 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
 149
 150         cclee, 11/9/12
 151
 152     Added zero page, single value page, sparse page, early abort optimizations
 153     rsrini, 09/14/14
 154 */
 155
 156 #ifndef PAGES_SIZE_IN_KBYTES            // keep any value already defined before this point
 157 #define PAGES_SIZE_IN_KBYTES    4       // otherwise default to 4 (KBytes per page)
 158 #endif
 159
 160 #if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))     // stop preprocessing for any value other than 4 or 16
 161 #error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
 162 #endif
 163
 164
 165         .text
 166         .align 4
 167
 168 /*
 169         int WKdm_compress_4k (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
 170 */
 171
 172 .globl _WKdm_compress_4k
 173 _WKdm_compress_4k:
 174
 175 /*
 176          -------------------------       symbolizing register use          -----------------------------------
 177 */
 178         #define src_buf                         x0
 179         #define next_input_word         x0
 180         #define dest_buf                        x1
 181         #define scratch                         x2
 182         #define byte_count                      x3
 183         #define next_tag                        x4
 184         #define tempTagsArray           x2              // scratch
 185         #define dictionary                      x5
 186         #define remaining                       x6
 187         #define next_full_patt          x7
 188         #define dict_location           x8
 189         #define wdict_location          w8
 190         #define next_qp                         x9
 191         #define hashTable                       x10
 192         #define tempQPosArray           x11
 193         #define next_low_bits           x12
 194
 195 /*
 196         this arm64 assembly code is ported from x86_64 assembly code,
 197         therefore need such symbolization to quickly reuse the x86_64 assembly code
 198         for these intermediate/temporary register use
 199 */
 200         #define rax                                     x13
 201         #define eax                                     w13
 202         #define rcx                                     x14
 203         #define ecx                                     w14
 204         #define rdx                                     x15
 205         #define edx                                     w15
 206         #define rdi                                     x0                      /* after some point, x0/rdi becomes free other usage */
 207
 208
 209 /*
 210                 -------------------------    scratch  memory  --------------------------------------
 211
 212         need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*2 (tempLowBitsArray)
 213         total 4160 bytes (64 bytes of dictionary on the stack + 4096 bytes of caller-provided scratch)
 214         [sp,#0]         : dictionary
 215         [scratch,#0]    : tempTagsArray
 216         [scratch,#1024] : tempQPosArray
 217         [scratch,#2048] : tempLowBitsArray
 218 */
 219
 220 #define scale   (PAGES_SIZE_IN_KBYTES/4)
 221
 222 #define SV_RETURN           0                       // return value when SV, ZV page is found
 223 #define MZV_MAGIC           17185                   // magic value used to identify MZV page encoding
 224 #define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
 225 #define CHKPT_WORDS         (CHKPT_BYTES/4)         // checkpoint bytes in words
 226 #define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
 227 #define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
 228                                                     //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
 229 #if CHKPT_BYTES > 4096
 230     #error CHKPT_BYTES must be <= 4096
 231 #endif
 232 #if CHKPT_BYTES < 4
 233     #error CHKPT_BYTES must be >= 4
 234 #endif
 235
 236 #if KERNEL
 237     sub     sp, sp, #64
 238     st1.4s  {v0,v1,v2,v3},[sp]
 239 #endif
 240
 241     sub     sp, sp, #64                                 // allocate for dictionary
 242         mov             dictionary, sp                          // use x5 to point to sp, so we can use sub xd, xn, sp
 243
 244     sub     sp, sp, #64                 // allocate space for saving callee-saved registers
 245         mov             x15, sp
 246     stp     x20, x21, [x15, #0]         // save x20, x21
 247     stp     x22, x23, [x15, #16]        // save x22, x23
 248     stp     x24, x25, [x15, #32]        // save x24, x25
 249     stp     x26, x27, [x15, #48]        // save x26, x27
 250
 251 /*
 252                 -------  entwined stack space allocation, registers set up, and PRELOAD_DICTIONARY -------------------
 253 */
 254
 255                                             // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
 256                                             // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
 257         mov             next_tag, tempTagsArray                 // &tempTagsArray[0]
 258         add             next_qp, scratch, #(1024*scale) // next_qp
 259         mov             remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint
 260         add             next_full_patt, dest_buf, #(12+256*scale)       // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
 261         sub             byte_count, byte_count, #(12+256*scale) // bit_count - header - tags
 262         add             next_low_bits, scratch, #(2048*scale)   // &tempLowBitsArray[0]
 263         stp             xzr, xzr, [dictionary, #0]              // initialize dictionary
 264         adrp    hashTable, _hashLookupTable@GOTPAGE
 265         stp             xzr, xzr, [dictionary, #16]             // initialize dictionary
 266         stp             xzr, xzr, [dictionary, #32]             // initialize dictionary
 267     ldr         hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF]
 268         stp             xzr, xzr, [dictionary, #48]             // initialize dictionary
 269
 270 #define EARLYCHECK              0
 271 #define NORMAL                  1
 272
 273 #define mode                    w20
 274 #define start_next_full_patt    x21
 275 #define start_next_input_word   x22
 276 #define start_next_low_bits     x23
 277 #define r11                     x24
 278 #define r13                     x25
 279 #define byte_budget             x26
 280 #define start_next_qp           tempQPosArray
 281
 282         add             tempQPosArray, scratch, #(1024*scale)       // &tempQPosArray[0]
 283     mov     mode, EARLYCHECK                            // indicate we are yet to evaluate the early aborts
 284     mov     start_next_full_patt, next_full_patt        // remember the start of next_full_patt
 285     mov     start_next_input_word, next_input_word      // remember the start of next_input_word
 286     mov     start_next_low_bits, next_low_bits          // remember the start of next_low_bit
 287     add     byte_budget, byte_count, #(12+256*scale)    // remember the byte budget
 288
 289         b               L_loop
 290
 291         .align  4, 0x90
 292
 293         /* we've just detected a zero input word in edx */
 294 L_RECORD_ZERO:
 295         strb    edx, [next_tag], #1                             // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0
 296         subs    remaining, remaining, #1                // remaing--;
 297         b.le    CHECKPOINT                                      // if remaining = 0, break
 298
 299         /* --------------    scan/tag pass loop -------------------------  */
 300 L_loop:
 301
 302         /* load new input word to edx */
 303         ldr             edx, [next_input_word], #4
 304         cbz             edx, L_RECORD_ZERO                                                      // if (input_word==0) RECORD_ZERO
 305
 306         /*
 307                 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
 308         */
 309         ubfm    eax, edx, #10, #17
 310         ldrb    wdict_location, [hashTable, rax]                // HASH_TO_DICT_BYTE_OFFSET(input_word)
 311         ldr             eax, [dictionary, dict_location]                // dict_word = *dict_location;
 312
 313         /* detect whether we match input to its corresponding dictionary word */
 314         eor             eax, eax, edx                                                   // dict_word vs input_word
 315         cbz             eax, L_RECORD_EXACT                                             // if identical, RECORD_EXACT
 316         lsr             eax, eax, #10                                                   // HIGH_BITS(dict_word^input_word)
 317         cbz             eax, L_RECORD_PARTIAL                                   // if identical, RECORD_PARTIAL
 318
 319 L_RECORD_MISS:
 320 /*
 321         if we are here, the input word can not be derived from the dictionary,
 322         we write the input word as a new word,
 323         and update the dictionary with this new word
 324 */
 325         subs    byte_count, byte_count, #4                              // byte_count -= 4
 326         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 327         str             edx, [next_full_patt], #4                               // *next_full_patt++ = input_word;
 328         mov             eax, #2                                                                 // tag for MISS
 329         subs    remaining, remaining, #1                                // remaing--;
 330         str             edx, [dictionary, dict_location]                // *dict_location = input_word
 331         strb    eax, [next_tag], #1                                             // *next_tag++ = 2 for miss
 332         b.gt    L_loop                                                                  // // if remaining > 0, repeat
 333     b       CHECKPOINT
 334
 335 L_done_search:
 336
 337         // SET_QPOS_AREA_START(dest_buf,next_full_patt);
 338         /* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */
 339
 340         sub             rax, next_full_patt, dest_buf                   // next_full_patt - dest_buf
 341         lsr             eax, eax, #2                                                    // offset in 4-bytes
 342         str             eax, [dest_buf]                                                 // dest_buf[0] = next_full_patt - dest_buf
 343
 344         /* --------------------------     packing 1024 tags into 256 bytes ----------------------------------------*/
 345         // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
 346
 347         add             rdi, dest_buf, #12                                              // dest_buf
 348         mov             rcx, tempTagsArray                                              // &tempTagsArray[0]
 349
 350 L_pack_2bits:
 351         ld1.2s  {v0,v1,v2,v3},[rcx],#32
 352
 353         shl.2d  v1,v1,#4
 354         shl.2d  v3,v3,#4
 355
 356         orr.8b  v0, v0, v1
 357         orr.8b  v2, v2, v3
 358
 359         ushr.2d v1, v0, #30
 360         ushr.2d v3, v2, #30
 361
 362         orr.8b  v0, v0, v1
 363         orr.8b  v2, v2, v3
 364
 365         zip1.2s v0, v0, v2
 366         st1.2s  {v0},[rdi],#8
 367         cmp             next_tag, rcx
 368         b.hi    L_pack_2bits
 369
 370         /* ---------------------------------      packing 4-bits dict indices into dest_buf ----------------------------------   */
 371
 372         /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
 373         sub             rax, next_qp, tempQPosArray                             // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
 374         add             eax, eax, #7                                                    // num_bytes_to_pack+7
 375         lsr             eax, eax, #3                                                    // num_packed_words = (num_bytes_to_pack + 7) >> 3
 376         add             rcx, tempQPosArray, rax, lsl #3                 // endQPosArray = tempQPosArray + 2*num_source_words
 377         lsl             rax, rax, #2
 378         subs    byte_count, byte_count, rax
 379         b.lt    L_budgetExhausted
 380
 381         cmp             rcx, next_qp                                                    // endQPosArray vs next_qp
 382         b.ls    2f                                                                              // if (next_qp >= endQPosArray) skip the following zero paddings
 383         sub             rax, rcx, next_qp
 384         mov             edx, #0
 385         tst             eax, #4
 386         b.eq    1f
 387         str             edx, [next_qp], #4
 388 1:      tst             eax, #2
 389         b.eq    1f
 390         strh    edx, [next_qp], #2
 391 1:      tst             eax, #1
 392         b.eq    2f
 393         strb    edx, [next_qp], #1
 394 2:
 395         mov             rdi, next_full_patt                                             // next_full_patt
 396         cmp             rcx, tempQPosArray                                              // endQPosArray vs tempQPosArray
 397         ldr             eax, [dest_buf]
 398         b.ls    L20                                                                             // if (endQPosArray <= tempQPosArray) skip the following
 399         mov             rdx, tempQPosArray                                              // tempQPosArray
 400
 401         /* packing 4-bits dict indices into dest_buf */
 402 L_pack_4bits:
 403         ldr             rax, [rdx], #8                                                  // src_next[1]:src_next[0]
 404         orr             rax, rax, rax, lsr #28                                  // eax = src_next[0] | (src_next[1] << 4)
 405         cmp             rcx, rdx                                                                // source_end vs src_next
 406         str             eax, [rdi], #4                                                  // *dest_next++ = temp;
 407         b.hi    L_pack_4bits                                                    // while (src_next < source_end) repeat the loop
 408
 409         // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
 410         sub             rax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 411         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in words
 412 L20:
 413         str             eax, [dest_buf,#4]                                              // dest_buf[1] = boundary_tmp - dest_buf
 414
 415
 416
 417         /*  --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[]   ----------------------------------------- */
 418
 419         add             rcx, scratch, #(2048*scale)                             // tempLowBitsArray
 420     sub         rdx, next_low_bits, rcx                                 // next_low_bits - tempLowBitsArray (in bytes)
 421         lsr             rdx, rdx, #1                                                    // num_tenbits_to_pack (in half-words)
 422         subs    edx, edx, #3                                                    // pre-decrement num_tenbits_to_pack by 3
 423         b.lt    1f                                                                              // if num_tenbits_to_pack < 3, skip the following loop
 424 0:
 425         subs    byte_count, byte_count, #4                              // byte_count -= 4
 426         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 427         subs    edx, edx, #3                                                    // num_tenbits_to_pack-=3
 428         ldr             rax, [rcx], #6
 429         bfm             rax, rax, #58, #9                                               // pack 1st toward 2nd
 430         bfm             rax, rax, #58, #25                                              // pack 1st/2nd toward 3rd
 431         lsr             rax, rax, #12
 432         str             eax, [rdi], #4                                                  // pack w0,w1,w2 into 1 dest_buf word
 433         b.ge    0b                                                                              // if no less than 3 elements, back to loop head
 434
 435 1:      adds    edx, edx, #3                                                    // post-increment num_tenbits_to_pack by 3
 436         b.eq    3f                                                                              // if num_tenbits_to_pack is a multiple of 3, skip the following
 437         subs    byte_count, byte_count, #4                              // byte_count -= 4
 438         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 439         ldrh    eax,[rcx]                                                               // w0
 440         subs    edx, edx, #1                                                    // num_tenbits_to_pack--
 441         b.eq    2f                                                                              //
 442         ldrh    edx, [rcx, #2]                                                  // w1
 443         orr             eax, eax, edx, lsl #10                                  // w0 | (w1<<10)
 444
 445 2:      str             eax, [rdi], #4                                                  // write the final dest_buf word
 446
 447 3:      sub             rax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 448         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in terms of words
 449         str             eax, [dest_buf, #8]                                             // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
 450         lsl             w0, eax, #2                                                             // boundary_tmp - dest_buf in terms of bytes
 451
 452 L_done:
 453
 454         // restore registers and return
 455         mov             x15, sp
 456     ldp     x20, x21, [x15, #0]             // restore x20, x21
 457     ldp     x22, x23, [x15, #16]            // restore x22, x23
 458     ldp     x24, x25, [x15, #32]            // restore x24, x25
 459     ldp     x26, x27, [x15, #48]            // restore x26, x27
 460     add     sp, sp, #128                                        // deallocate for dictionary + saved register space
 461
 462 #if KERNEL
 463         ld1.4s  {v0,v1,v2,v3},[sp],#64
 464 #endif
 465         ret             lr
 466
 467     .align  4
 468 L_budgetExhausted:
 469     mov     x0, #-1
 470     b       L_done
 471
 472
 473         .align 4,0x90
 474 L_RECORD_EXACT:
 475 /*
 476                 we have an exact match of the input word to its corresponding dictionary word
 477                 write tag/dict_index to the temporary buffers
 478 */
 479         mov             eax, #3
 480         lsr             w14, wdict_location, #2                         // divide by 4 for word offset
 481         subs    remaining, remaining, #1                        // remaing--;
 482         strb    eax, [next_tag], #1                                     // *next_tag++ = 3 for exact
 483         strb    w14, [next_qp], #1                                      // *next_qp = word offset (4-bit)
 484         b.gt    L_loop
 485         b               CHECKPOINT                                              // if remaining = 0, break
 486
 487         .align 4,0x90
 488 L_RECORD_PARTIAL:
 489 /*
 490                 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
 491                 write tag/dict_index/low 10 bits to the temporary buffers
 492 */
 493         mov             ecx, #1
 494         strb    ecx, [next_tag], #1                                     // *next_tag++ = 1 for partial matched
 495         str             edx, [dictionary, dict_location]        // *dict_location = input_word;
 496         subs    remaining, remaining, #1                        // remaing--;
 497         lsr             eax, wdict_location, #2                         // offset in 32-bit word
 498         and             edx, edx, #1023                                         // lower 10 bits
 499         strb    eax, [next_qp], #1                                      // update *next_qp++
 500         strh    edx, [next_low_bits], #2                        // save next_low_bits++
 501         b.gt    L_loop
 502
 503 CHECKPOINT:
 504
 505     cbz     mode, L_check_compression_ratio             // if this this an early abort check..
 506
 507 L_check_zero_page:
 508
 509     cmp     start_next_full_patt, next_full_patt        // check if any dictionary misses in page
 510     b.ne    L_check_single_value_page
 511
 512     cmp     start_next_qp, next_qp                      // check if any partial or exact dictionary matches
 513     b.ne    L_check_single_value_page
 514
 515     mov     x0, #SV_RETURN                              // Magic return value
 516     b       L_done
 517
 518 L_check_single_value_page:
 519
 520     sub     rax, next_full_patt, start_next_full_patt   // get # dictionary misses
 521     lsr     rax, rax, #2
 522
 523     sub     r11, next_qp, start_next_qp                 // get # dictionary hits (exact + partial)
 524
 525     sub     r13, next_low_bits, start_next_low_bits     // get # dictionary partial hits
 526     lsr     r13, r13, #1
 527
 528     // Single value page if one of the following is true:
 529     //  partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss)
 530     //  partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial)
 531     //
 532     cbnz    r13, 1f                                     // were there 0 partial hits?
 533
 534     cmp     r11, #(256*PAGES_SIZE_IN_KBYTES - 1)        // were there 1023 dictionary hits
 535     b.ne    1f
 536
 537     cmp     rax, #1                                     // was there exacly 1 dictionary miss?
 538     b.ne    1f
 539
 540     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 541     cmp     edx, #2                                     // was the very 1st tag a miss?
 542     b.eq    L_is_single_value_page
 543
 544 1:
 545     cmp     r13, #1                                     // was there 1 partial hit?
 546     b.ne    L_check_mostly_zero
 547
 548     cmp     r11, #(256*PAGES_SIZE_IN_KBYTES)           // were there 1024 dictionary hits
 549     b.ne    L_check_mostly_zero
 550
 551     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 552     cmp     edx, #1                                     // was the very 1st tag a partial?
 553     b.ne    L_check_mostly_zero
 554
 555 L_is_single_value_page:
 556     // One of the two single-value conditions held: return SV_RETURN, the same value the zero-page check returns.
 557     mov     x0, #SV_RETURN                              // Magic return value
 558     b       L_done
 559
 560 L_check_mostly_zero:
 561                                                         // how much space will the sparse packer take? (on entry rax = number of misses, r11 = number of hits, from L_check_single_value_page)
 562     add     rax, rax, r11                               // rax += (next_qp - start_next_qp), i.e. misses + hits
 563     mov     rdx, #6
 564     mov     rcx, #4
 565     madd    r11, rax, rdx, rcx                          // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
 566     // now estimate the size of the default packer output in rax
 567     sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
 568     mov     rdx, #1365
 569     mul     rax, rax, rdx                               // 1365/2048 is about 2/3; the division by 2048 is the lsr 11 folded into the add below
 570
 571     sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
 572     add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 573
 574     sub     rdx, next_qp, start_next_qp
 575     add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
 576     add     rax, rax, #(12+256*scale)                   // rax += bytes taken by the header + tags
 577
 578     cmp     rax, r11                                    // is the default packer the better option?
 579     b.lt    L_done_search                               // default estimate is smaller than the sparse size
 580
 581     cmp     r11, byte_budget                            // can the sparse packer fit into the given budget?
 582     b.gt    L_budgetExhausted                           // no: sparse size exceeds the budget; otherwise fall through to L_sparse_packer
 583
 584 L_sparse_packer:
 585     mov     edx, #MZV_MAGIC
 586     str     edx, [dest_buf], #4                         // header to indicate a sparse packer
 587     // Scan the source 8 bytes per iteration; each non-zero 32-bit word is emitted as the 4-byte word followed by its 2-byte byte offset.
 588     mov     rdx, #0                                     // rdx = byte offset in src of non-0 word
 589 1:
 590     ldr     rax, [start_next_input_word, rdx]           // rax = read dword
 591     cbnz    rax, 5f                                     // is dword != 0
 592 3:
 593     add     rdx, rdx, #8                                // 8 more bytes have been processed
 594 4:
 595     cmp     rdx, #(4096*scale)                          // has the entire page been processed
 596     b.ne    1b
 597     mov     x0, r11                                     // store the size of the compressed stream (r11 was computed in L_check_mostly_zero)
 598     b       L_done
 599     // at least one of the two 32-bit halves of the dword is non-zero
 600 5:
 601     cbz     eax, 6f                                     // is lower word == 0
 602     str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
 603     strh    edx, [dest_buf], #2                         // store the byte index
 604 6:
 605     lsr     rax, rax, 32                                // get the upper word into position
 606     cbz     eax, 3b                                     // is upper word == 0, then just advance by 8 bytes
 607     add     rdx, rdx, #4                                // rdx = byte offset of the upper word
 608     str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
 609     strh    edx, [dest_buf], #2                         // store the byte index
 610     add     rdx, rdx, #4                                // second half of the 8-byte step, rdx now points at the next dword
 611     b       4b
 612
 613 L_check_compression_ratio:
 614     // Early-abort test, entered from CHECKPOINT when mode is zero.
 615     mov     mode, NORMAL                                // NOTE(review): presumably NORMAL is non-zero so the next CHECKPOINT visit skips this test; confirm against its definition
 616         mov             remaining, #((1024 - CHKPT_WORDS)*scale)    // remaining input words to process
 617     cbz     remaining, CHECKPOINT                       // if there are no remaining words to process
 618     // estimate the packed size of what has been produced so far
 619     sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
 620     mov     rdx, #1365
 621     mul     rax, rax, rdx                               // 1365/2048 is about 2/3; the division by 2048 is the lsr 11 folded into the add below
 622
 623     sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
 624     add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 625
 626     sub     rdx, next_qp, start_next_qp
 627     add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
 628     subs    rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale)
 629                                                         // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES (subs sets the flags for the b.gt below)
 630
 631     b.gt    L_budgetExhausted                           // if rax is > 0, we need to early abort
 632     b       L_loop                                      // ratio is acceptable: resume the main loop with the new remaining count