osfmk/arm64/WKdmCompress_16k.s

   1 /*
   2  * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  This file contains arm64 hand optimized implementation of WKdm memory page compressor.
  31
  32         int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
  33
  34         input :
  35                 src_buf : address of input page (length = 1024 words)
  36                 dest_buf : address of output buffer (may not be 16-byte aligned)
  37                 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
  38                 bytes_budget : a given byte target in compression
  39
  40         output :
  41
  42                 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
  43                 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
  44                 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
  45                 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
  46                 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
  47
  48  The 4th argument bytes_budget is the target compress budget in bytes.
  49  Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
  50  Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
  51
  52  WKdm Compression algorithm is briefly stated as follows:
  53
  54         There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
  55         For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
  56                 a. k = (x>>10)&255;                                             // 8-bit hash table index
  57                 b. dict_index = hashTable[k];                   // 4-bit dictionary index, hashTable[] is fixed
  58                 c. dict_word = dictionary[dict_index];  // 32-bit dictionary word, dictionary[] is dynamically updated
  59
  60         Each input word x is classified/tagged into 4 classes :
  61                 0 : x = 0
  62                 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
  63                 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
  64                 3 : (x == dict_word), the exact input word is in the dictionary
  65
  66         For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
  67                 0 : 2-bits tag (32->2 compression)
  68                 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
  69                 2 : 2-bits tag + 32-bits new word (32->34 expansion)
  70                 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
  71
  72         It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
  73
  74         the output bit stream (*dest_buf) consists of
  75                 a. 12 bytes header
  76                 b. 256 bytes for 1024 packed tags
  77                 c. (varying number of) words for new words not matched to dictionary word.
  78                 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
  79                 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
  80
  81         the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
  82         Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
  83
  84
  85         The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
  86
  87         The temp buffers are
  88
  89                 uint8_t         tempTagsArray[1024];                    // temporary saving for tags before final packing
  90                 uint8_t         tempQPosArray[1024];                    // temporary saving for dict_indices before final packing
  91                 uint16_t        tempLowBitsArray[1024];                 // temporary saving for partially matched lower 10 bits before final packing
  92
  93         Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
  94         the destination buffer.
  95
  96                 uint32_t        *new_word = dest_buf+3+64;              // 3 words for header, 64 words for tags, new words come right after the tags.
  97
  98         Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass.
  99
 100         byte_count -= 12 + 256;         // bit budget minus header and tags
 101
 102         whenever an input word is classified as class
 103
 104                 2 : byte_count -= 4;
 105
 106         the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0).
 107
 108         without showing the bit budget management, the pseudo code is given as follows:
 109
 110         uint8_t         *tags=tempTagsArray;
 111         uint8_t         *dict=tempQPosArray;
 112         uint8_t         *partial=tempLowBitsArray;
 113
 114         for (i=0;i<1024;i++) {
 115                         x = *src_buf++;
 116                         if (x == 0) {           // zero, 2-bits tag
 117                                         *tags++ = 0;
 118                         } else {
 119
 120                                 // find dict_index and dict_word from x
 121                                 k = (x>>10)&255;
 122                                 dict_index = hashTable[k];
 123                                 dict_word = dictionary[dict_index];
 124
 125                                 if (dict_word == x) { // exactly match
 126                                         // 2-bits tag + 4-bits table index
 127                                         *tags++ = 3;
 128                                         *dict++ = dict_index;
 129                                 } else if (((x^dict_word)>>10)==0) {    // 22 higher bits matched
 130                                         // 2-bits tag + 4-bits table index + 10-bits lower partial
 131                                         *tags++ = 1;
 132                     *dict++ = dict_index;
 133                                         *partial++ = x &0x3ff;
 134                                         dictionary[dict_index] = x;
 135                                 } else {        // not matched
 136                                         // 2-bits tag + 32-bits new word
 137                                         *tags++ = 2;
 138                                         *new_word++ = x;
 139                                         dictionary[dict_index] = x;
 140                                 }
 141                         }
 142         }
 143
 144         after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
 145
 146                 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
 147                 2. dictionary indices (4-bits each) are packed into are right after the new words section
 148                 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
 149
 150         cclee, 11/9/12
 151
 152     Added zero page, single value page, sparse page, early abort optimizations
 153     rsrini, 09/14/14
 154 */
 155
 156 #define PAGES_SIZE_IN_KBYTES    16
 157
 158 #ifndef PAGES_SIZE_IN_KBYTES
 159 #define PAGES_SIZE_IN_KBYTES    4
 160 #endif
 161
 162 #if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))
 163 #error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
 164 #endif
 165
 166
 167         .text
 168         .align 4
 169
 170 /*
 171         int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
 172 */
 173
 174 .globl _WKdm_compress_16k
 175 _WKdm_compress_16k:
 176
 177 /*
 178          -------------------------       symbolizing register use          -----------------------------------
 179 */
 180         #define src_buf                         x0
 181         #define next_input_word         x0
 182         #define dest_buf                        x1
 183         #define scratch                         x2
 184         #define byte_count                      x3
 185         #define next_tag                        x4
 186         #define tempTagsArray           x2              // scratch
 187         #define dictionary                      x5
 188         #define remaining                       x6
 189         #define next_full_patt          x7
 190         #define dict_location           x8
 191         #define wdict_location          w8
 192         #define next_qp                         x9
 193         #define hashTable                       x10
 194         #define tempQPosArray           x11
 195         #define next_low_bits           x12
 196
 197 /*
 198         this arm64 assembly code is ported from x86_64 assembly code,
 199         therefore need such symbolization to quickly reuse the x86_64 assembly code
 200         for these intermediate/temporary register use
 201 */
 202         #define rax                                     x13
 203         #define eax                                     w13
 204         #define rcx                                     x14
 205         #define ecx                                     w14
 206         #define rdx                                     x15
 207         #define edx                                     w15
 208         #define rdi                                     x0                      /* after some point, x0/rdi becomes free other usage */
 209
 210
 211 /*
 212                 -------------------------    scratch  memory  --------------------------------------
 213
 214         need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray)
 215         total 6208 bytes
 216         [sp,#0]         : dictionary
 217         [scratch,#0]    : tempTagsArray
 218         [scratch,#1024] : tempQPosArray
 219         [scratch,#2048] : tempLowBitsArray
 220 */
 221
 222 #define scale   (PAGES_SIZE_IN_KBYTES/4)
 223
 224 #define SV_RETURN           0                       // return value when SV, ZV page is found
 225 #define MZV_MAGIC           17185                   // magic value used to identify MZV page encoding
 226 #define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
 227 #define CHKPT_WORDS         (CHKPT_BYTES/4)         // checkpoint bytes in words
 228 #define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
 229 #define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
 230                                                     //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
 231 #if CHKPT_BYTES > 4096
 232     #error CHKPT_BYTES must be <= 4096
 233 #endif
 234 #if CHKPT_BYTES < 4
 235     #error CHKPT_BYTES must be >= 4
 236 #endif
 237
 238 #if KERNEL
 239     sub     sp, sp, #64
 240     st1.4s  {v0,v1,v2,v3},[sp]
 241 #endif
 242
 243     sub     sp, sp, #64                                 // allocate for dictionary
 244         mov             dictionary, sp                          // use x5 to point to sp, so we can use sub xd, xn, sp
 245
 246     sub     sp, sp, #64                 // allocate space for saving callee-saved registers
 247         mov             x15, sp
 248     stp     x20, x21, [x15, #0]         // save x20, x21
 249     stp     x22, x23, [x15, #16]        // save x22, x23
 250     stp     x24, x25, [x15, #32]        // save x24, x25
 251     stp     x26, x27, [x15, #48]        // save x26, x27
 252
 253 /*
 254                 -------  entwined statck space allocation, registers set up, and PRELOAD_DICTIONARY -------------------
 255 */
 256
 257                                             // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
 258                                             // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
 259         mov             next_tag, tempTagsArray                 // &tempTagsArray[0]
 260         add             next_qp, scratch, #(1024*scale) // next_qp
 261         mov             remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint
 262         add             next_full_patt, dest_buf, #(12+256*scale)       // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
 263         sub             byte_count, byte_count, #(12+256*scale) // bit_count - header - tags
 264         add             next_low_bits, scratch, #(2048*scale)   // &tempLowBitsArray[0]
 265         stp             xzr, xzr, [dictionary, #0]              // initialize dictionary
 266         adrp    hashTable, _hashLookupTable@GOTPAGE
 267         stp             xzr, xzr, [dictionary, #16]             // initialize dictionary
 268         stp             xzr, xzr, [dictionary, #32]             // initialize dictionary
 269     ldr         hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF]
 270         stp             xzr, xzr, [dictionary, #48]             // initialize dictionary
 271
 272 #define EARLYCHECK              0
 273 #define NORMAL                  1
 274
 275 #define mode                    w20
 276 #define start_next_full_patt    x21
 277 #define start_next_input_word   x22
 278 #define start_next_low_bits     x23
 279 #define r11                     x24
 280 #define r13                     x25
 281 #define byte_budget             x26
 282 #define start_next_qp           tempQPosArray
 283
 284         add             tempQPosArray, scratch, #(1024*scale)       // &tempQPosArray[0]
 285     mov     mode, EARLYCHECK                            // indicate we are yet to evaluate the early aborts
 286     mov     start_next_full_patt, next_full_patt        // remember the start of next_full_patt
 287     mov     start_next_input_word, next_input_word      // remember the start of next_input_word
 288     mov     start_next_low_bits, next_low_bits          // remember the start of next_low_bit
 289     add     byte_budget, byte_count, #(12+256*scale)    // remember the byte budget
 290
 291         b               L_loop
 292
 293         .align  4, 0x90
 294
 295         /* we've just detected a zero input word in edx */
 296 L_RECORD_ZERO:
 297         strb    edx, [next_tag], #1                             // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0
 298         subs    remaining, remaining, #1                // remaing--;
 299         b.le    CHECKPOINT                                      // if remaining = 0, break
 300
 301         /* --------------    scan/tag pass loop -------------------------  */
 302 L_loop:
 303
 304         /* load new input word to edx */
 305         ldr             edx, [next_input_word], #4
 306         cbz             edx, L_RECORD_ZERO                                                      // if (input_word==0) RECORD_ZERO
 307
 308         /*
 309                 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
 310         */
 311         ubfm    eax, edx, #10, #17
 312         ldrb    wdict_location, [hashTable, rax]                // HASH_TO_DICT_BYTE_OFFSET(input_word)
 313         ldr             eax, [dictionary, dict_location]                // dict_word = *dict_location;
 314
 315         /* detect whether we match input to its corresponding dictionary word */
 316         eor             eax, eax, edx                                                   // dict_word vs input_word
 317         cbz             eax, L_RECORD_EXACT                                             // if identical, RECORD_EXACT
 318         lsr             eax, eax, #10                                                   // HIGH_BITS(dict_word^input_word)
 319         cbz             eax, L_RECORD_PARTIAL                                   // if identical, RECORD_PARTIAL
 320
 321 L_RECORD_MISS:
 322 /*
 323         if we are here, the input word can not be derived from the dictionary,
 324         we write the input word as a new word,
 325         and update the dictionary with this new word
 326 */
 327         subs    byte_count, byte_count, #4                              // byte_count -= 4
 328         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 329         str             edx, [next_full_patt], #4                               // *next_full_patt++ = input_word;
 330         mov             eax, #2                                                                 // tag for MISS
 331         subs    remaining, remaining, #1                                // remaing--;
 332         str             edx, [dictionary, dict_location]                // *dict_location = input_word
 333         strb    eax, [next_tag], #1                                             // *next_tag++ = 2 for miss
 334         b.gt    L_loop                                                                  // // if remaining > 0, repeat
 335     b       CHECKPOINT
 336
 337 L_done_search:
 338
 339         // SET_QPOS_AREA_START(dest_buf,next_full_patt);
 340         /* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */
 341
 342         sub             rax, next_full_patt, dest_buf                   // next_full_patt - dest_buf
 343         lsr             eax, eax, #2                                                    // offset in 4-bytes
 344         str             eax, [dest_buf]                                                 // dest_buf[0] = next_full_patt - dest_buf
 345
 346         /* --------------------------     packing 1024 tags into 256 bytes ----------------------------------------*/
 347         // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
 348
 349         add             rdi, dest_buf, #12                                              // dest_buf
 350         mov             rcx, tempTagsArray                                              // &tempTagsArray[0]
 351
 352 L_pack_2bits:
 353         ld1.2s  {v0,v1,v2,v3},[rcx],#32
 354
 355         shl.2d  v1,v1,#4
 356         shl.2d  v3,v3,#4
 357
 358         orr.8b  v0, v0, v1
 359         orr.8b  v2, v2, v3
 360
 361         ushr.2d v1, v0, #30
 362         ushr.2d v3, v2, #30
 363
 364         orr.8b  v0, v0, v1
 365         orr.8b  v2, v2, v3
 366
 367         zip1.2s v0, v0, v2
 368         st1.2s  {v0},[rdi],#8
 369         cmp             next_tag, rcx
 370         b.hi    L_pack_2bits
 371
 372         /* ---------------------------------      packing 4-bits dict indices into dest_buf ----------------------------------   */
 373
 374         /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
 375         sub             rax, next_qp, tempQPosArray                             // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
 376         add             eax, eax, #7                                                    // num_bytes_to_pack+7
 377         lsr             eax, eax, #3                                                    // num_packed_words = (num_bytes_to_pack + 7) >> 3
 378         add             rcx, tempQPosArray, rax, lsl #3                 // endQPosArray = tempQPosArray + 2*num_source_words
 379         lsl             rax, rax, #2
 380         subs    byte_count, byte_count, rax
 381         b.lt    L_budgetExhausted
 382
 383         cmp             rcx, next_qp                                                    // endQPosArray vs next_qp
 384         b.ls    2f                                                                              // if (next_qp >= endQPosArray) skip the following zero paddings
 385         sub             rax, rcx, next_qp
 386         mov             edx, #0
 387         tst             eax, #4
 388         b.eq    1f
 389         str             edx, [next_qp], #4
 390 1:      tst             eax, #2
 391         b.eq    1f
 392         strh    edx, [next_qp], #2
 393 1:      tst             eax, #1
 394         b.eq    2f
 395         strb    edx, [next_qp], #1
 396 2:
 397         mov             rdi, next_full_patt                                             // next_full_patt
 398         cmp             rcx, tempQPosArray                                              // endQPosArray vs tempQPosArray
 399         ldr             eax, [dest_buf]
 400         b.ls    L20                                                                             // if (endQPosArray <= tempQPosArray) skip the following
 401         mov             rdx, tempQPosArray                                              // tempQPosArray
 402
 403         /* packing 4-bits dict indices into dest_buf */
 404 L_pack_4bits:
 405         ldr             rax, [rdx], #8                                                  // src_next[1]:src_next[0]
 406         orr             rax, rax, rax, lsr #28                                  // eax = src_next[0] | (src_next[1] << 4)
 407         cmp             rcx, rdx                                                                // source_end vs src_next
 408         str             eax, [rdi], #4                                                  // *dest_next++ = temp;
 409         b.hi    L_pack_4bits                                                    // while (src_next < source_end) repeat the loop
 410
 411         // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
 412         sub             rax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 413         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in words
 414 L20:
 415         str             eax, [dest_buf,#4]                                              // dest_buf[1] = boundary_tmp - dest_buf
 416
 417
 418
 419         /*  --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[]   ----------------------------------------- */
 420
 421         add             rcx, scratch, #(2048*scale)                             // tempLowBitsArray
 422     sub         rdx, next_low_bits, rcx                                 // next_low_bits - tempLowBitsArray (in bytes)
 423         lsr             rdx, rdx, #1                                                    // num_tenbits_to_pack (in half-words)
 424         subs    edx, edx, #3                                                    // pre-decrement num_tenbits_to_pack by 3
 425         b.lt    1f                                                                              // if num_tenbits_to_pack < 3, skip the following loop
 426 0:
 427         subs    byte_count, byte_count, #4                              // byte_count -= 4
 428         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 429         subs    edx, edx, #3                                                    // num_tenbits_to_pack-=3
 430         ldr             rax, [rcx], #6
 431         bfm             rax, rax, #58, #9                                               // pack 1st toward 2nd
 432         bfm             rax, rax, #58, #25                                              // pack 1st/2nd toward 3rd
 433         lsr             rax, rax, #12
 434         str             eax, [rdi], #4                                                  // pack w0,w1,w2 into 1 dest_buf word
 435         b.ge    0b                                                                              // if no less than 3 elements, back to loop head
 436
 437 1:      adds    edx, edx, #3                                                    // post-increment num_tenbits_to_pack by 3
 438         b.eq    3f                                                                              // if num_tenbits_to_pack is a multiple of 3, skip the following
 439         subs    byte_count, byte_count, #4                              // byte_count -= 4
 440         b.le    L_budgetExhausted                                               // return -1 to signal this page is not compressable
 441         ldrh    eax,[rcx]                                                               // w0
 442         subs    edx, edx, #1                                                    // num_tenbits_to_pack--
 443         b.eq    2f                                                                              //
 444         ldrh    edx, [rcx, #2]                                                  // w1
 445         orr             eax, eax, edx, lsl #10                                  // w0 | (w1<<10)
 446
 447 2:      str             eax, [rdi], #4                                                  // write the final dest_buf word
 448
 449 3:      sub             rax, rdi, dest_buf                                              // boundary_tmp - dest_buf
 450         lsr             eax, eax, #2                                                    // boundary_tmp - dest_buf in terms of words
 451         str             eax, [dest_buf, #8]                                             // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
 452         lsl             w0, eax, #2                                                             // boundary_tmp - dest_buf in terms of bytes
 453
 454 L_done:
 455
 456         // restore registers and return
 457         mov             x15, sp
 458     ldp     x20, x21, [x15, #0]             // restore x20, x21
 459     ldp     x22, x23, [x15, #16]            // restore x22, x23
 460     ldp     x24, x25, [x15, #32]            // restore x24, x25
 461     ldp     x26, x27, [x15, #48]            // restore x26, x27
 462     add     sp, sp, #128                                        // deallocate for dictionary + saved register space
 463
 464 #if KERNEL
 465         ld1.4s  {v0,v1,v2,v3},[sp],#64
 466 #endif
 467         ret             lr
 468
 469     .align  4
 470 L_budgetExhausted:
 471     mov     x0, #-1
 472     b       L_done
 473
 474
 475         .align 4,0x90
 476 L_RECORD_EXACT:
 477 /*
 478                 we have an exact match of the input word to its corresponding dictionary word
 479                 write tag/dict_index to the temorary buffers
 480 */
 481         mov             eax, #3
 482         lsr             w14, wdict_location, #2                         // divide by 4 for word offset
 483         subs    remaining, remaining, #1                        // remaing--;
 484         strb    eax, [next_tag], #1                                     // *next_tag++ = 3 for exact
 485         strb    w14, [next_qp], #1                                      // *next_qp = word offset (4-bit)
 486         b.gt    L_loop
 487         b               CHECKPOINT                                              // if remaining = 0, break
 488
 489         .align 4,0x90
 490 L_RECORD_PARTIAL:
 491 /*
 492                 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
 493                 write tag/dict_index/low 10 bits to the temorary buffers
 494 */
 495         mov             ecx, #1
 496         strb    ecx, [next_tag], #1                                     // *next_tag++ = 1 for partial matched
 497         str             edx, [dictionary, dict_location]        // *dict_location = input_word;
 498         subs    remaining, remaining, #1                        // remaing--;
 499         lsr             eax, wdict_location, #2                         // offset in 32-bit word
 500         and             edx, edx, #1023                                         // lower 10 bits
 501         strb    eax, [next_qp], #1                                      // update *next_qp++
 502         strh    edx, [next_low_bits], #2                        // save next_low_bits++
 503         b.gt    L_loop
 504
 505 CHECKPOINT:
 506
 507     cbz     mode, L_check_compression_ratio             // if this this an early abort check..
 508
 509 L_check_zero_page:
 510
 511     cmp     start_next_full_patt, next_full_patt        // check if any dictionary misses in page
 512     b.ne    L_check_single_value_page
 513
 514     cmp     start_next_qp, next_qp                      // check if any partial or exact dictionary matches
 515     b.ne    L_check_single_value_page
 516
 517     mov     x0, #SV_RETURN                              // Magic return value
 518     b       L_done
 519
 520 L_check_single_value_page:
 521
 522     sub     rax, next_full_patt, start_next_full_patt   // get # dictionary misses
 523     lsr     rax, rax, #2
 524
 525     sub     r11, next_qp, start_next_qp                 // get # dictionary hits (exact + partial)
 526
 527     sub     r13, next_low_bits, start_next_low_bits     // get # dictionary partial hits
 528     lsr     r13, r13, #1
 529
 530     // Single value page if one of the follwoing is true:
 531     //  partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss)
 532     //  partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial)
 533     //
 534     cbnz    r13, 1f                                     // were there 0 partial hits?
 535
 536     cmp     r11, #(256*PAGES_SIZE_IN_KBYTES - 1)        // were there 1023 dictionary hits
 537     b.ne    1f
 538
 539     cmp     rax, #1                                     // was there exacly 1 dictionary miss?
 540     b.ne    1f
 541
 542     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 543     cmp     edx, #2                                     // was the very 1st tag a miss?
 544     b.eq    L_is_single_value_page
 545
 546 1:
 547     cmp     r13, #1                                     // was there 1 partial hit?
 548     b.ne    L_check_mostly_zero
 549
 550     cmp     r11, #(256*PAGES_SIZE_IN_KBYTES)           // were there 1024 dictionary hits
 551     b.ne    L_check_mostly_zero
 552
 553     ldrb    edx, [tempTagsArray]                        // read the very 1st tag
 554     cmp     edx, #1                                     // was the very 1st tag a partial?
 555     b.ne    L_check_mostly_zero
 556
 557 L_is_single_value_page:
 558
 559     mov     x0, #SV_RETURN                              // Magic return value
 560     b       L_done
 561
 562 L_check_mostly_zero:
 563                                                         // how much space will the sparse packer take?
 564     add     rax, rax, r11                               // rax += (next_qp - start_next_qp)
 565     mov     rdx, #6
 566     mov     rcx, #4
 567     madd    r11, rax, rdx, rcx                          // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
 568
 569     sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
 570     mov     rdx, #1365
 571     mul     rax, rax, rdx
 572
 573     sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
 574     add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 575
 576     sub     rdx, next_qp, start_next_qp
 577     add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
 578     add     rax, rax, #(12+256*scale)                   // rax += bytes taken by the header + tags
 579
 580     cmp     rax, r11                                    // is the default packer the better option?
 581     b.lt    L_done_search
 582
 583     cmp     r11, byte_budget                            // can the sparse packer fit into the given budget?
 584     b.gt    L_budgetExhausted
 585
 586 L_sparse_packer:
 587     mov     edx, #MZV_MAGIC
 588     str     edx, [dest_buf], #4                         // header to indicate a sparse packer
 589
 590     mov     rdx, #0                                     // rdx = byte offset in src of non-0 word
 591 1:
 592     ldr     rax, [start_next_input_word, rdx]           // rax = read dword
 593     cbnz    rax, 5f                                     // is dword != 0
 594 3:
 595     add     rdx, rdx, #8                                // 8 more bytes have been processed
 596 4:
 597     cmp     rdx, #(4096*scale)                          // has the entire page been processed
 598     b.ne    1b
 599     mov     x0, r11                                     // store the size of the compressed stream
 600     b       L_done
 601
 602 5:
 603     cbz     eax, 6f                                     // is lower word == 0
 604     str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
 605     strh    edx, [dest_buf], #2                         // store the byte index
 606 6:
 607     lsr     rax, rax, 32                                // get the upper word into position
 608     cbz     eax, 3b                                     // is dword == 0
 609     add     rdx, rdx, #4
 610     str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
 611     strh    edx, [dest_buf], #2                         // store the byte index
 612     add     rdx, rdx, #4
 613     b       4b
 614
 615 L_check_compression_ratio:
 616
 617     mov     mode, NORMAL
 618         mov             remaining, #((1024 - CHKPT_WORDS)*scale)    // remaining input words to process
 619     cbz     remaining, CHECKPOINT                       // if there are no remaining words to process
 620
 621     sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
 622     mov     rdx, #1365
 623     mul     rax, rax, rdx
 624
 625     sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
 626     add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
 627
 628     sub     rdx, next_qp, start_next_qp
 629     add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
 630     subs    rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale)
 631                                                         // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES
 632
 633     b.gt    L_budgetExhausted                           // if rax is > 0, we need to early abort
 634     b       L_loop                                      // we are done