/* * Copyright (c) 2000-2013 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* This file contains x86_64 hand optimized implementation of WKdm memory page decompressor. void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); input : src_buf : address of input compressed data buffer dest_buf : address of output decompressed buffer scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller words : this argument is not used in the implementation output : the input buffer is decompressed and the dest_buf is written with decompressed data. Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s The bit stream (*src_buf) consists of a. 12 bytes header b. 256 bytes for 1024 packed tags c. (varying number of) words for new words not matched to dictionary word. d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows for (i=0;i<1024;i++) { tag = *next_tag++ switch (tag) { case 0 : *dest_buf++ = 0; break; case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; case 3 : *dest_buf++ = dictionary[*dict_index++]; break; } cclee, 11/30/12 Added zero page, single value page, sparse page, early abort optimizations rsrini, 09/14/14 */ #define MZV_MAGIC $17185 // magic value used to identify MZV page encoding .text .globl _WKdm_decompress_new _WKdm_decompress_new: // save registers, and allocate stack memory for local variables pushq %rbp movq %rsp, %rbp pushq %r12 pushq %r13 pushq %rbx subq $(64+8+16), %rsp movl 0(%rdi), %eax // read the 1st word from the header cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)? jne L_default_decompressor // default decompressor was used // Mostly Zero Page Handling... // { movq $0, %rax 1: // Zero out the entire page movq $0, 0(%rsi, %rax) movq $0, 8(%rsi, %rax) movq $0, 16(%rsi, %rax) movq $0, 24(%rsi, %rax) movq $0, 32(%rsi, %rax) movq $0, 40(%rsi, %rax) movq $0, 48(%rsi, %rax) movq $0, 56(%rsi, %rax) addq $64, %rax cmpq $4096, %rax jne 1b movq $4, %r12 // current byte position in src to read from 2: movl 0(%rdi, %r12), %eax // get the word movzwq 4(%rdi, %r12), %rdx // get the index movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer addq $6, %r12 // 6 more bytes processed cmpl %ecx, %r12d // finished processing all the bytes? jne 2b jmp L_done // } L_default_decompressor: movq %rsi, %r12 // dest_buf movq %rdx, %r13 // scratch_buf // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp) // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR #if 1 movl $0, 0(%rsp) movl $0, 4(%rsp) movl $0, 8(%rsp) movl $0, 12(%rsp) movl $0, 16(%rsp) movl $0, 20(%rsp) movl $0, 24(%rsp) movl $0, 28(%rsp) movl $0, 32(%rsp) movl $0, 36(%rsp) movl $0, 40(%rsp) movl $0, 44(%rsp) movl $0, 48(%rsp) movl $0, 52(%rsp) movl $0, 56(%rsp) movl $0, 60(%rsp) #else mov $0x100000001, %rax mov %rax, (%rsp) mov %rax, 8(%rsp) mov %rax, 16(%rsp) mov %rax, 24(%rsp) mov %rax, 32(%rsp) mov %rax, 40(%rsp) mov %rax, 48(%rsp) mov %rax, 56(%rsp) #endif // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); leaq 268(%rdi), %r10 // TAGS_AREA_END leaq 12(%rdi), %rax // TAGS_AREA_START movq %r13, %rsi // tempTagsArray cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits movq %r13, %rcx // next_word xorl %r8d, %r8d // i = 0 mov $(50529027<<32)+50529027, %r9 L_WK_unpack_2bits: movl 12(%rdi,%r8, 4), %eax movl 12(%rdi,%r8, 4), %edx shrl $2, %eax shlq $32, %rax orq %rdx, %rax movq %rax, %rdx shrq $4, %rax andq %r9, %rdx andq %r9, %rax incq %r8 // i++ movq %rdx, (%rcx) movq %rax, 8(%rcx) addq $16, %rcx // next_tags += 16 cmpq $64, %r8 // i vs 64 jne L_WK_unpack_2bits // repeat loop until i==64 1: // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); mov 4(%rdi), %eax // WKdm header qpos end leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END mov 0(%rdi), %eax // WKdm header qpos start leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START leaq 1024(%r13), %rbx // tempQPosArray cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits leaq 8(%rbx), %rcx // next_qpos mov $(252645135<<32)+252645135, %r11 L_WK_unpack_4bits: movl (%r8), %eax // w = *next_word movl %eax, %edx // w shlq $28, %rax orq %rdx, %rax addq $4, %r8 // next_word++ andq %r11, %rax movq %rax, -8(%rcx) addq $8, %rcx // next_qpos+=8 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START 1: // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); movl 8(%rdi), %eax // LOW_BITS_AREA_END offset leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END leaq 2048(%r13), %r11 // tempLowBitsArray leaq 4094(%r13), %r13 // final tenbits addr sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END jle 1f // if START>=END, skip L_WK_unpack_3_tenbits movq %r11, %rcx // next_low_bits L_WK_unpack_3_tenbits: movl (%r9), %eax // w = *next_word, 0:c:b:a movl $(1023<<10), %edx movl $(1023<<20), %r8d andl %eax, %edx // b << 10 andl %eax, %r8d // c << 20 andq $1023, %rax shll $6, %edx shlq $12, %r8 orl %edx, %eax orq %r8, %rax cmp %r13, %rcx je 2f mov %rax, (%rcx) jmp 3f 2: mov %ax, (%rcx) 3: addq $4, %r9 // next_word++ addq $6, %rcx // next_low_bits += 3 sub $4, %rdi jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 1: #define next_qpos %rbx #define hash %r8 #define tags_counter %edi #define dest_buf %r12 #define next_full_patt %r10 leaq _hashLookupTable_new(%rip), hash // hash look up table movl $1024, tags_counter // tags_counter jmp L_next .align 4,0x90 L_nonpartital: jl L_ZERO_TAG cmpb $2, -1(%rsi) je L_MISS_TAG L_EXACT_TAG: movzbl (next_qpos), %eax // qpos = *next_qpos incq next_qpos // next_qpos++ decl tags_counter // tags_counter-- movl (%rsp,%rax,4), %eax // w = dictionary[qpos] movl %eax, -4(dest_buf) // *dest_buf = w je L_done L_next: incq %rsi // next_tag++ addq $4, dest_buf cmpb $1, -1(%rsi) jne L_nonpartital L_PARTIAL_TAG: movzbl (next_qpos),%edx // qpos = *next_qpos incq next_qpos // next_qpos++ movl (%rsp,%rdx,4), %eax // read dictionary word andl $-1024, %eax // clear lower 10 bits or (%r11), %ax // pad the lower 10-bits from *next_low_bits addq $2, %r11 // next_low_bits++ decl tags_counter // tags_counter-- movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word movl %eax, -4(dest_buf) // *dest_buf = newly formed word jg L_next // repeat loop until next_tag==tag_area_end L_done: // release stack memory, restore registers, and return addq $(64+8+16), %rsp popq %rbx popq %r13 popq %r12 leave ret .align 4,0x90 L_MISS_TAG: movl (next_full_patt), %edx // w = *next_full_patt movl (next_full_patt), %eax // w = *next_full_patt shrl $10, %edx // w>>10 addq $4, next_full_patt // next_full_patt++ movzbl %dl, %edx // 8-bit hash table index movl %eax, -4(dest_buf) // *dest_buf = word movzbl (hash,%rdx),%edx // qpos decl tags_counter // tags_counter-- movl %eax, (%rsp,%rdx) // dictionary[qpos] = word jg L_next // repeat the loop jmp L_done .align 4,0x90 L_ZERO_TAG: decl tags_counter // tags_counter-- movl $0, -4(dest_buf) // *dest_buf = 0 jg L_next // repeat the loop jmp L_done