2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor.
32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
35 src_buf : address of input compressed data buffer
36 dest_buf : address of output decompressed buffer
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38 words : this argument is not used in the implementation
42 the input buffer is decompressed and the dest_buf is written with decompressed data.
44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s
46 The bit stream (*src_buf) consists of
48 b. 256 bytes for 1024 packed tags
49 c. (varying number of) words for new words not matched to dictionary word.
50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
57 for (i=0;i<1024;i++) {
60 case 0 : *dest_buf++ = 0; break;
61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break;
68 Added zero page, single value page, sparse page, early abort optimizations
72 #define MZV_MAGIC $17185 // magic value used to identify MZV page encoding
76 .globl _WKdm_decompress_new
79 // save registers, and allocate stack memory for local variables
89 movl 0(%rdi), %eax // read the 1st word from the header
90 cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)?
91 jne L_default_decompressor // default decompressor was used
93 // Mostly Zero Page Handling...
96 1: // Zero out the entire page
97 movq $0, 0(%rsi, %rax)
98 movq $0, 8(%rsi, %rax)
99 movq $0, 16(%rsi, %rax)
100 movq $0, 24(%rsi, %rax)
101 movq $0, 32(%rsi, %rax)
102 movq $0, 40(%rsi, %rax)
103 movq $0, 48(%rsi, %rax)
104 movq $0, 56(%rsi, %rax)
109 movq $4, %r12 // current byte position in src to read from
111 movl 0(%rdi, %r12), %eax // get the word
112 movzwq 4(%rdi, %r12), %rdx // get the index
113 movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer
114 addq $6, %r12 // 6 more bytes processed
115 cmpl %ecx, %r12d // finished processing all the bytes?
120 L_default_decompressor:
122 movq %rsi, %r12 // dest_buf
123 movq %rdx, %r13 // scratch_buf
125 // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp)
126 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
145 mov $0x100000001, %rax
156 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
158 leaq 268(%rdi), %r10 // TAGS_AREA_END
159 leaq 12(%rdi), %rax // TAGS_AREA_START
160 movq %r13, %rsi // tempTagsArray
161 cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START
162 jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
163 movq %r13, %rcx // next_word
164 xorl %r8d, %r8d // i = 0
165 mov $(50529027<<32)+50529027, %r9
167 movl 12(%rdi,%r8, 4), %eax
168 movl 12(%rdi,%r8, 4), %edx
179 addq $16, %rcx // next_tags += 16
180 cmpq $64, %r8 // i vs 64
181 jne L_WK_unpack_2bits // repeat loop until i==64
185 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
187 mov 4(%rdi), %eax // WKdm header qpos end
188 leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END
189 mov 0(%rdi), %eax // WKdm header qpos start
190 leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START
191 leaq 1024(%r13), %rbx // tempQPosArray
192 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
193 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
194 leaq 8(%rbx), %rcx // next_qpos
196 mov $(252645135<<32)+252645135, %r11
198 movl (%r8), %eax // w = *next_word
202 addq $4, %r8 // next_word++
205 addq $8, %rcx // next_qpos+=8
206 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
207 ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START
212 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
214 movl 8(%rdi), %eax // LOW_BITS_AREA_END offset
215 leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END
216 leaq 2048(%r13), %r11 // tempLowBitsArray
217 leaq 4094(%r13), %r13 // final tenbits addr
218 sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
219 jle 1f // if START>=END, skip L_WK_unpack_3_tenbits
220 movq %r11, %rcx // next_low_bits
221 L_WK_unpack_3_tenbits:
222 movl (%r9), %eax // w = *next_word, 0:c:b:a
223 movl $(1023<<10), %edx
224 movl $(1023<<20), %r8d
225 andl %eax, %edx // b << 10
226 andl %eax, %r8d // c << 20
238 addq $4, %r9 // next_word++
239 addq $6, %rcx // next_low_bits += 3
241 jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
245 #define next_qpos %rbx
247 #define tags_counter %edi
248 #define dest_buf %r12
249 #define next_full_patt %r10
251 leaq _hashLookupTable_new(%rip), hash // hash look up table
252 movl $1024, tags_counter // tags_counter
262 movzbl (next_qpos), %eax // qpos = *next_qpos
263 incq next_qpos // next_qpos++
264 decl tags_counter // tags_counter--
265 movl (%rsp,%rax,4), %eax // w = dictionary[qpos]
266 movl %eax, -4(dest_buf) // *dest_buf = w
270 incq %rsi // next_tag++
276 movzbl (next_qpos),%edx // qpos = *next_qpos
277 incq next_qpos // next_qpos++
278 movl (%rsp,%rdx,4), %eax // read dictionary word
279 andl $-1024, %eax // clear lower 10 bits
280 or (%r11), %ax // pad the lower 10-bits from *next_low_bits
281 addq $2, %r11 // next_low_bits++
282 decl tags_counter // tags_counter--
283 movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word
284 movl %eax, -4(dest_buf) // *dest_buf = newly formed word
285 jg L_next // repeat loop until next_tag==tag_area_end
289 // release stack memory, restore registers, and return
291 addq $(64+8+16), %rsp
300 movl (next_full_patt), %edx // w = *next_full_patt
301 movl (next_full_patt), %eax // w = *next_full_patt
302 shrl $10, %edx // w>>10
303 addq $4, next_full_patt // next_full_patt++
304 movzbl %dl, %edx // 8-bit hash table index
305 movl %eax, -4(dest_buf) // *dest_buf = word
306 movzbl (hash,%rdx),%edx // qpos
307 decl tags_counter // tags_counter--
308 movl %eax, (%rsp,%rdx) // dictionary[qpos] = word
309 jg L_next // repeat the loop
314 decl tags_counter // tags_counter--
315 movl $0, -4(dest_buf) // *dest_buf = 0
316 jg L_next // repeat the loop