osfmk/arm64/WKdmDecompress_4k.s

   1 /*
   2  * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  This file contains arm64 hand optimized implementation of WKdm memory page decompressor.
  31
  32         void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word *scratch, __unused__ unsigned int words);
  33
  34         input :
  35                 src_buf : address of input compressed data buffer
  36                 dest_buf : address of output decompressed buffer
  37                 scratch : an 8-k bytes scratch mempro provided by the caller
  38                 words : this argument is not used in the implementation
  39         (The 4th argument is, in fact, used by the Mostly Zero Value decoder)
  40
  41         output :
  42
  43                 the input buffer is decompressed and the dest_buf is written with decompressed data.
  44
  45         Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress arm64 assembly code WKdmCompress.s
  46
  47         The bit stream (*src_buf) consists of
  48                 a. 12 bytes header
  49                 b. 256 bytes for 1024 packed tags
  50                 c. (varying number of) words for new words not matched to dictionary word.
  51                 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
  52                 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
  53
  54         where the header (of 3 words) specifies the ending boundaries (in 32-bit words) of the bit stream of c,d,e, respectively.
  55
  56         The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
  57
  58                 for (i=0;i<1024;i++) {
  59                         tag = *next_tag++
  60                         switch (tag) {
  61                                 case 0 : *dest_buf++ = 0; break;
  62                                 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
  63                                 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
  64                                 case 3 : *dest_buf++ = dictionary[*dict_index++];  break;
  65                         }
  66
  67         cclee, Nov 9, '12
  68
  69     Added zero page, single value page, sparse page, early abort optimizations
  70     rsrini, 09/14/14
  71 */
  72
  73 #define MZV_MAGIC               17185      // magic value used to identify MZV page encoding
  74
  75 #ifndef PAGES_SIZE_IN_KBYTES
  76 #define PAGES_SIZE_IN_KBYTES    4
  77 #endif
  78
  79 #if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))
  80 #error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
  81 #endif
  82
  83 #define scale (PAGES_SIZE_IN_KBYTES/4)
  84
  85
  86         .align  4
  87         .text
  88
  89 /*
  90          void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes);
  91 */
  92
  93         .globl _WKdm_decompress_4k
  94 _WKdm_decompress_4k:
  95
  96         /*
  97                         --------   symbolizing registers --------
  98                         the arm64 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names.
  99         */
 100
 101         #define src_buf                 x0
 102         #define dest_buf                x1
 103         #define scratch                 x2
 104     #define n_bytes         x3
 105         #define dictionary              sp
 106         #define rax                             x13
 107         #define eax                             w13
 108         #define rbx                             x4
 109         #define ebx                             w4
 110         #define rcx                             x5
 111         #define ecx                             w5
 112         #define rdx                             x6
 113         #define edx                             w6
 114         #define tags_counter    x7
 115         #define next_tag                x12
 116         #define r8                              x8
 117         #define r9                              x9
 118         #define r10                             x10
 119         #define r11                             x11
 120     #define r12             x12
 121
 122         /*
 123
 124                 ------   scratch memory for local variables  ---------
 125
 126     [sp,#0] : dictionary
 127     [scratch,#0] : tempTagsArray
 128     [scratch,#1024] : tempQPosArray
 129     [scratch,#2048] : tempLowBitsArray
 130
 131         */
 132
 133 #if KERNEL
 134         sub             rax, sp, #96
 135         sub             sp, sp, #96
 136         st1.4s  {v0,v1,v2},[rax],#48
 137         st1.4s  {v3,v4,v5},[rax],#48
 138 #endif
 139
 140         sub             sp, sp, #64
 141
 142     ldr     eax, [src_buf]                      // read the 1st word from the header
 143     mov     ecx, #MZV_MAGIC
 144     cmp     eax, ecx                            // is the alternate packer used (i.e. is MZV page)?
 145     b.ne    L_default_decompressor              // default decompressor was used
 146
 147                                                 // Mostly Zero Page Handling...
 148                                                 // {
 149     add     src_buf, src_buf, 4                 // skip the header
 150     mov     rax, dest_buf
 151     mov     rcx, #(PAGES_SIZE_IN_KBYTES*1024)   // number of bytes to zero out
 152 1:
 153     dc      zva, rax                            // zero 64 bytes. since dest_buf is a page, it will be 4096 or 16384 byte aligned
 154     add     rax, rax, #64
 155     dc      zva, rax
 156     add     rax, rax, #64
 157     dc      zva, rax
 158     add     rax, rax, #64
 159     dc      zva, rax
 160     add     rax, rax, #64
 161     subs    rcx, rcx, #256
 162     b.ne    1b
 163
 164     mov     r12, #4                             // current byte position in src to read from
 165     mov     rdx, #0
 166 2:
 167     ldr     eax, [src_buf], #4                  // get the word
 168     ldrh    edx, [src_buf], #2                  // get the index
 169     str     eax, [dest_buf, rdx]                // store non-0 word in the destination buffer
 170     add     r12, r12, #6                        // 6 more bytes processed
 171     cmp     r12, n_bytes                        // finished processing all the bytes?
 172     b.ne    2b
 173     b       L_done
 174                                                 // }
 175
 176 L_default_decompressor:
 177
 178     /*
 179                         ---------------------- set up registers and PRELOAD_DICTIONARY ---------------------------------
 180         */
 181     // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
 182         adrp    rbx, _table_2bits@GOTPAGE
 183     stp     xzr, xzr, [dictionary, #0]
 184         add             r10, src_buf, #(12+256*scale)           // TAGS_AREA_END
 185     stp     xzr, xzr, [dictionary, #16]
 186         add             rax, src_buf, #12                       // TAGS_AREA_START
 187     ldr     rbx, [rbx, _table_2bits@GOTPAGEOFF]
 188     stp     xzr, xzr, [dictionary, #32]
 189         mov             rcx, scratch                            // tempTagsArray
 190     stp     xzr, xzr, [dictionary, #48]
 191         ld1.4s  {v0,v1},[rbx]
 192
 193
 194         /*
 195                         ------------------------------  unpacking bit stream ----------------------------------
 196         */
 197
 198         // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
 199 /*
 200         unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes
 201     for arm64, this can be done by
 202                 1. read the input 32-bit word into GPR w
 203         2. duplicate GPR into 4 elements in a vector register v0
 204         3. ushl.4s vd, v0, vshift   where vshift = {0, -2, -4, -6}
 205         4. and.4s  vd, vd, vmask    where vmask = 0x03030303030303030303030303030303
 206 */
 207
 208 L_WK_unpack_2bits:
 209         ldr             q5, [rax], #16                          // read 4 32-bit words for 64 2-bit tags
 210         dup.4s  v2, v5[0]                                       // duplicate to 4 elements
 211         dup.4s  v3, v5[1]                                       // duplicate to 4 elements
 212         dup.4s  v4, v5[2]                                       // duplicate to 4 elements
 213         dup.4s  v5, v5[3]                                       // duplicate to 4 elements
 214         ushl.4s v2, v2, v0                                      // v1 = {0, -2, -4, -6}
 215         ushl.4s v3, v3, v0                                      // v1 = {0, -2, -4, -6}
 216         ushl.4s v4, v4, v0                                      // v1 = {0, -2, -4, -6}
 217         ushl.4s v5, v5, v0                                      // v1 = {0, -2, -4, -6}
 218         and.16b v2, v2, v1                                      // v2 = {3,3,...,3}
 219         and.16b v3, v3, v1                                      // v2 = {3,3,...,3}
 220         and.16b v4, v4, v1                                      // v2 = {3,3,...,3}
 221         and.16b v5, v5, v1                                      // v2 = {3,3,...,3}
 222         cmp             r10, rax                                        // TAGS_AREA_END vs TAGS_AREA_START
 223         st1.4s  {v2,v3,v4,v5}, [rcx], #64       // write 64 tags into tempTagsArray
 224         b.hi    L_WK_unpack_2bits                       // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits
 225
 226
 227         // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
 228
 229         ldp             w8, w9, [src_buf]                       // WKdm header qpos start and end
 230         adrp    rbx, _table_4bits@GOTPAGE
 231         subs    x14, r9, r8                                     // x14 = (QPOS_AREA_END - QPOS_AREA_START)/4
 232         add             r8, src_buf, r8, lsl #2         // QPOS_AREA_START
 233         add             r9, src_buf, r9, lsl #2         // QPOS_AREA_END
 234
 235         b.ls    1f                                                      // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
 236     ldr     rbx, [rbx, _table_4bits@GOTPAGEOFF]
 237         add             rcx, scratch, #(1024*scale)             // tempQPosArray
 238         ld1.4s  {v0,v1},[rbx]
 239         subs    w14, w14, #1
 240         b.ls    2f                                                      // do loop of 2 only if w14 >= 5
 241 L_WK_unpack_4bits:
 242         ldr             d2, [r8], #8                            // read a 32-bit word for 8 4-bit positions
 243         subs    w14, w14, #2
 244         zip1.4s v2, v2, v2
 245         ushl.4s v2, v2, v0                                      // v1 = {0, -4, 0, -4}
 246         and.16b v2, v2, v1                                      // v2 = {15,15,...,15}
 247         str             q2, [rcx], #16
 248         b.hi    L_WK_unpack_4bits
 249 2:
 250         adds    w14, w14, #1
 251         b.le    1f
 252
 253         ldr             s3, [r8], #4                            // read a 32-bit word for 8 4-bit positions
 254         dup.2s  v2, v3[0]                                       // duplicate to 2 elements
 255         ushl.2s v2, v2, v0                                      // v1 = {0, -4}
 256         and.8b  v2, v2, v1                                      // v2 = {15,15,...,15}
 257         str             d2, [rcx], #8                           // write 16 tags into tempTagsArray
 258
 259 1:
 260
 261         // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
 262
 263         ldr             eax, [src_buf,#8]                       // LOW_BITS_AREA_END offset
 264         add             r8, src_buf, rax, lsl #2        // LOW_BITS_AREA_END
 265         add             rcx, scratch, #(2048*scale)     // tempLowBitsArray
 266 #if (scale==1)
 267         add             r11, scratch, #(4096*scale-2)   // final tenbits for the rare case
 268 #else
 269         add             r11, scratch, #(4096*scale)     // final tenbits for the rare case
 270         sub             r11, r11, #2
 271 #endif
 272         subs    r8, r8, r9                                      // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
 273         b.ls    1f                                                      // if START>=END, skip L_WK_unpack_3_tenbits
 274
 275         adrp    rbx, _table_10bits@GOTPAGE
 276     ldr     rbx, [rbx, _table_10bits@GOTPAGEOFF]
 277         ld1.4s  {v0,v1,v2,v3},[rbx]
 278
 279         /*
 280                 a very rare case : 1024 tenbits, 1023 + 1 -> 341 + final 1 that is padded with 2 zeros
 281                 since the scratch memory is 4k (2k for this section), we need to pay attention to the last case
 282                 so we don't overwrite to the scratch memory
 283
 284                 we 1st do a single 3_tenbits, followed by 2x_3_tenbits loop, and detect whether the last 3_tenbits
 285                 hits the raee case
 286         */
 287 #if 1
 288         subs    r8, r8, #4                                      // pre-decrement by 8
 289         ldr             s4, [r9], #4                            // read 32-bit words for 3 low 10-bits
 290         zip1.4s v4,     v4,     v4      // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice.
 291         ushl.4s v5,     v4,     v0      // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89.
 292         ushl.4s v4,     v4,     v1      // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105.
 293         and.16b v5,     v5,     v2      // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets.
 294         and.16b v4,     v4,     v3      // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets
 295         orr.16b v4,     v4,     v5      // combine data
 296         str             d4, [rcx], #6                           // write 3 low 10-bits
 297         b.eq    1f
 298 #endif
 299
 300         subs    r8, r8, #8                                      // pre-decrement by 8
 301         b.lt    L_WK_unpack_3_tenbits
 302
 303 L_WK_unpack_2x_3_tenbits:
 304         ldr             d4, [r9], #8                            // read 2 32-bit words for a pair of 3 low 10-bits
 305         zip1.4s v4,     v4,     v4      // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice.
 306         ushl.4s v5,     v4,     v0      // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89.
 307         ushl.4s v4,     v4,     v1      // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105.
 308         and.16b v5,     v5,     v2      // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets.
 309         and.16b v4,     v4,     v3      // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets
 310         orr.16b v4,     v4,     v5      // combine data
 311         ins             v5.d[0], v4.d[1]
 312         str             d4, [rcx], #6                           // write 3 low 10-bits
 313         str             d5, [rcx], #6                           // write 3 low 10-bits
 314
 315         subs    r8, r8, #8
 316         b.ge    L_WK_unpack_2x_3_tenbits                // repeat loop if LOW_BITS_AREA_END > next_word
 317
 318         tst             r8, #4
 319         b.eq    1f
 320
 321 L_WK_unpack_3_tenbits:
 322         ldr             s4, [r9]                                        // read 32-bit words for 3 low 10-bits
 323         zip1.4s v4,     v4,     v4      // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice.
 324         ushl.4s v5,     v4,     v0      // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89.
 325         ushl.4s v4,     v4,     v1      // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105.
 326         and.16b v5,     v5,     v2      // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets.
 327         and.16b v4,     v4,     v3      // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets
 328         orr.16b v4,     v4,     v5      // combine data
 329 #if 0
 330         str             d4, [rcx]       // write 3 low 10-bits
 331 #else
 332         cmp             rcx, r11
 333         b.eq    2f
 334         str             d4, [rcx]       // write 3 low 10-bits
 335         b               1f
 336 2:
 337         str             h4, [rcx]       // write final 1 low 10-bits
 338 #endif
 339 1:
 340
 341         /*
 342                 set up before going to the main decompress loop
 343         */
 344         mov             next_tag, scratch                               // tempTagsArray
 345         add             r8, scratch, #(1024*scale)              // next_qpos
 346         add             r11, scratch, #(2048*scale)             // tempLowBitsArray
 347         adrp    rbx, _hashLookupTable@GOTPAGE
 348         mov             tags_counter, #(1024*scale)             // tag_area_end
 349     ldr     rbx, [rbx, _hashLookupTable@GOTPAGEOFF]
 350
 351         b               L_next
 352
 353         .align 4,0x90
 354 L_ZERO_TAG:
 355         /*
 356                 we can only get here if w9 = 0, meaning this is a zero tag
 357                 *dest_buf++ = 0;
 358         */
 359         str             w9, [dest_buf], #4                              // *dest_buf++ = 0
 360         subs    tags_counter, tags_counter, #1  // next_tag vs tag_area_end
 361         b.ls    L_done                                                  // if next_tag >= tag_area_end, we're done
 362
 363         /* WKdm decompress main loop */
 364 L_next:
 365         ldrb    w9, [next_tag], #1                              // new tag
 366         cbz             w9, L_ZERO_TAG
 367         cmp             w9, #2                                                  // partial match tag ?
 368         b.eq    L_MISS_TAG
 369         b.gt    L_EXACT_TAG
 370
 371 L_PARTIAL_TAG:
 372         /*
 373                         this is a partial match:
 374                                 dict_word = dictionary[*dict_index];
 375                                 dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++;
 376         */
 377
 378         ldrb    edx, [r8], #1                                   // qpos = *next_qpos++
 379         ldrh    ecx, [r11], #2                                  // lower 10-bits from *next_low_bits++
 380         ldr             eax, [dictionary, rdx, lsl #2]  // read dictionary word
 381         bfm             eax, ecx, #0, #9                                // pad the lower 10-bits from *next_low_bits
 382         str             eax, [dictionary,rdx,lsl #2]    // *dict_location = newly formed word
 383         str             eax, [dest_buf], #4                             // *dest_buf++ = newly formed word
 384         subs    tags_counter, tags_counter, #1  // next_tag vs tag_area_end
 385         b.gt    L_next                                                  // repeat loop until next_tag==tag_area_end
 386
 387 L_done:
 388
 389         // release stack memory, restore registers, and return
 390         add             sp, sp, #64                                     // deallocate for dictionary
 391 #if KERNEL
 392         ld1.4s  {v0,v1,v2},[sp],#48
 393         ld1.4s  {v3,v4,v5},[sp],#48
 394 #endif
 395         ret             lr
 396
 397         .align 4,0x90
 398 L_MISS_TAG:
 399         /*
 400                 this is a dictionary miss.
 401                         x = *new_word++;
 402                         k = (x>>10)&255;
 403                         k = hashTable[k];
 404                         dictionary[k] = *dest_buf++ = x;
 405         */
 406         ldr             eax, [r10], #4                                  // w = *next_full_patt++
 407         ubfm    edx, eax, #10, #17                              // 8-bit hash table index
 408         str             eax, [dest_buf], #4                             // *dest_buf++ = word
 409         ldrb    edx, [rbx, rdx]                                 // qpos
 410         str             eax, [dictionary,rdx]                   // dictionary[qpos] = word
 411         subs    tags_counter, tags_counter, #1  // next_tag vs tag_area_end
 412         b.gt    L_next                                                  // repeat the loop
 413         b               L_done                                                  // if next_tag >= tag_area_end, we're done
 414
 415         .align 4,0x90
 416 L_EXACT_TAG:
 417         /*
 418                         this is an exact match;
 419                         *dest_buf++ = dictionary[*dict_index++];
 420         */
 421         ldrb    eax, [r8], #1                                   // qpos = *next_qpos++
 422         ldr             eax, [dictionary,rax,lsl #2]    // w = dictionary[qpos]
 423         str             eax, [dest_buf], #4                             // *dest_buf++ = w
 424         subs    tags_counter, tags_counter, #1  // next_tag vs tag_area_end
 425         b.gt    L_next                                                  // repeat the loop
 426         b               L_done                                                  // if next_tag >= tag_area_end, we're done
 427
 428