]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2000-2013 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains x86_64 hand optimized implementation of WKdm memory page decompressor. | |
31 | ||
32 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); | |
33 | ||
34 | input : | |
35 | src_buf : address of input compressed data buffer | |
36 | dest_buf : address of output decompressed buffer | |
37 | scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller | |
38 | words : this argument is not used in the implementation | |
39 | ||
40 | output : | |
41 | ||
42 | the input buffer is decompressed and the dest_buf is written with decompressed data. | |
43 | ||
44 | Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s | |
45 | ||
46 | The bit stream (*src_buf) consists of | |
47 | a. 12 bytes header | |
48 | b. 256 bytes for 1024 packed tags | |
49 | c. (varying number of) words for new words not matched to dictionary word. | |
50 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
51 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
52 | ||
53 | where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. | |
54 | ||
55 | The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows | |
56 | ||
57 | for (i=0;i<1024;i++) { | |
58 | tag = *next_tag++ | |
59 | switch (tag) { | |
60 | case 0 : *dest_buf++ = 0; break; | |
61 | case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; | |
62 | case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; | |
63 | case 3 : *dest_buf++ = dictionary[*dict_index++]; break; | |
64 | } | |
65 | ||
66 | cclee, 11/30/12 | |
67 | ||
68 | Added zero page, single value page, sparse page, early abort optimizations | |
69 | rsrini, 09/14/14 | |
70 | */ | |
71 | ||
72 | #define MZV_MAGIC $17185 // magic value used to identify MZV page encoding | |
73 | ||
74 | .text | |
75 | ||
76 | .globl _WKdm_decompress_new | |
77 | _WKdm_decompress_new: | |
78 | ||
79 | // save registers, and allocate stack memory for local variables | |
80 | ||
81 | pushq %rbp | |
82 | movq %rsp, %rbp | |
83 | pushq %r12 | |
84 | pushq %r13 | |
85 | pushq %rbx | |
86 | ||
87 | subq $(64+8+16), %rsp | |
88 | ||
89 | movl 0(%rdi), %eax // read the 1st word from the header | |
90 | cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)? | |
91 | jne L_default_decompressor // default decompressor was used | |
92 | ||
93 | // Mostly Zero Page Handling... | |
94 | // { | |
95 | movq $0, %rax | |
96 | 1: // Zero out the entire page | |
97 | movq $0, 0(%rsi, %rax) | |
98 | movq $0, 8(%rsi, %rax) | |
99 | movq $0, 16(%rsi, %rax) | |
100 | movq $0, 24(%rsi, %rax) | |
101 | movq $0, 32(%rsi, %rax) | |
102 | movq $0, 40(%rsi, %rax) | |
103 | movq $0, 48(%rsi, %rax) | |
104 | movq $0, 56(%rsi, %rax) | |
105 | addq $64, %rax | |
106 | cmpq $4096, %rax | |
107 | jne 1b | |
108 | ||
109 | movq $4, %r12 // current byte position in src to read from | |
110 | 2: | |
111 | movl 0(%rdi, %r12), %eax // get the word | |
112 | movzwq 4(%rdi, %r12), %rdx // get the index | |
113 | movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer | |
114 | addq $6, %r12 // 6 more bytes processed | |
115 | cmpl %ecx, %r12d // finished processing all the bytes? | |
116 | jne 2b | |
117 | jmp L_done | |
118 | // } | |
119 | ||
120 | L_default_decompressor: | |
121 | ||
122 | movq %rsi, %r12 // dest_buf | |
123 | movq %rdx, %r13 // scratch_buf | |
124 | ||
125 | // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp) | |
126 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR | |
127 | #if 1 | |
128 | movl $0, 0(%rsp) | |
129 | movl $0, 4(%rsp) | |
130 | movl $0, 8(%rsp) | |
131 | movl $0, 12(%rsp) | |
132 | movl $0, 16(%rsp) | |
133 | movl $0, 20(%rsp) | |
134 | movl $0, 24(%rsp) | |
135 | movl $0, 28(%rsp) | |
136 | movl $0, 32(%rsp) | |
137 | movl $0, 36(%rsp) | |
138 | movl $0, 40(%rsp) | |
139 | movl $0, 44(%rsp) | |
140 | movl $0, 48(%rsp) | |
141 | movl $0, 52(%rsp) | |
142 | movl $0, 56(%rsp) | |
143 | movl $0, 60(%rsp) | |
144 | #else | |
145 | mov $0x100000001, %rax | |
146 | mov %rax, (%rsp) | |
147 | mov %rax, 8(%rsp) | |
148 | mov %rax, 16(%rsp) | |
149 | mov %rax, 24(%rsp) | |
150 | mov %rax, 32(%rsp) | |
151 | mov %rax, 40(%rsp) | |
152 | mov %rax, 48(%rsp) | |
153 | mov %rax, 56(%rsp) | |
154 | #endif | |
155 | ||
156 | // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); | |
157 | ||
158 | leaq 268(%rdi), %r10 // TAGS_AREA_END | |
159 | leaq 12(%rdi), %rax // TAGS_AREA_START | |
160 | movq %r13, %rsi // tempTagsArray | |
161 | cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START | |
162 | jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits | |
163 | movq %r13, %rcx // next_word | |
164 | xorl %r8d, %r8d // i = 0 | |
165 | mov $(50529027<<32)+50529027, %r9 | |
166 | L_WK_unpack_2bits: | |
167 | movl 12(%rdi,%r8, 4), %eax | |
168 | movl 12(%rdi,%r8, 4), %edx | |
169 | shrl $2, %eax | |
170 | shlq $32, %rax | |
171 | orq %rdx, %rax | |
172 | movq %rax, %rdx | |
173 | shrq $4, %rax | |
174 | andq %r9, %rdx | |
175 | andq %r9, %rax | |
176 | incq %r8 // i++ | |
177 | movq %rdx, (%rcx) | |
178 | movq %rax, 8(%rcx) | |
179 | addq $16, %rcx // next_tags += 16 | |
180 | cmpq $64, %r8 // i vs 64 | |
181 | jne L_WK_unpack_2bits // repeat loop until i==64 | |
182 | 1: | |
183 | ||
184 | ||
185 | // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); | |
186 | ||
187 | mov 4(%rdi), %eax // WKdm header qpos end | |
188 | leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END | |
189 | mov 0(%rdi), %eax // WKdm header qpos start | |
190 | leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START | |
191 | leaq 1024(%r13), %rbx // tempQPosArray | |
192 | cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START | |
193 | jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits | |
194 | leaq 8(%rbx), %rcx // next_qpos | |
195 | ||
196 | mov $(252645135<<32)+252645135, %r11 | |
197 | L_WK_unpack_4bits: | |
198 | movl (%r8), %eax // w = *next_word | |
199 | movl %eax, %edx // w | |
200 | shlq $28, %rax | |
201 | orq %rdx, %rax | |
202 | addq $4, %r8 // next_word++ | |
203 | andq %r11, %rax | |
204 | movq %rax, -8(%rcx) | |
205 | addq $8, %rcx // next_qpos+=8 | |
206 | cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START | |
207 | ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START | |
208 | ||
209 | ||
210 | 1: | |
211 | ||
212 | // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); | |
213 | ||
214 | movl 8(%rdi), %eax // LOW_BITS_AREA_END offset | |
215 | leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END | |
216 | leaq 2048(%r13), %r11 // tempLowBitsArray | |
217 | leaq 4094(%r13), %r13 // final tenbits addr | |
218 | sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END | |
219 | jle 1f // if START>=END, skip L_WK_unpack_3_tenbits | |
220 | movq %r11, %rcx // next_low_bits | |
221 | L_WK_unpack_3_tenbits: | |
222 | movl (%r9), %eax // w = *next_word, 0:c:b:a | |
223 | movl $(1023<<10), %edx | |
224 | movl $(1023<<20), %r8d | |
225 | andl %eax, %edx // b << 10 | |
226 | andl %eax, %r8d // c << 20 | |
227 | andq $1023, %rax | |
228 | shll $6, %edx | |
229 | shlq $12, %r8 | |
230 | orl %edx, %eax | |
231 | orq %r8, %rax | |
232 | cmp %r13, %rcx | |
233 | je 2f | |
234 | mov %rax, (%rcx) | |
235 | jmp 3f | |
236 | 2: mov %ax, (%rcx) | |
237 | 3: | |
238 | addq $4, %r9 // next_word++ | |
239 | addq $6, %rcx // next_low_bits += 3 | |
240 | sub $4, %rdi | |
241 | jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word | |
242 | 1: | |
243 | ||
244 | ||
245 | #define next_qpos %rbx | |
246 | #define hash %r8 | |
247 | #define tags_counter %edi | |
248 | #define dest_buf %r12 | |
249 | #define next_full_patt %r10 | |
250 | ||
251 | leaq _hashLookupTable_new(%rip), hash // hash look up table | |
252 | movl $1024, tags_counter // tags_counter | |
253 | jmp L_next | |
254 | ||
255 | .align 4,0x90 | |
256 | L_nonpartital: | |
257 | jl L_ZERO_TAG | |
258 | cmpb $2, -1(%rsi) | |
259 | je L_MISS_TAG | |
260 | ||
261 | L_EXACT_TAG: | |
262 | movzbl (next_qpos), %eax // qpos = *next_qpos | |
263 | incq next_qpos // next_qpos++ | |
264 | decl tags_counter // tags_counter-- | |
265 | movl (%rsp,%rax,4), %eax // w = dictionary[qpos] | |
266 | movl %eax, -4(dest_buf) // *dest_buf = w | |
267 | je L_done | |
268 | ||
269 | L_next: | |
270 | incq %rsi // next_tag++ | |
271 | addq $4, dest_buf | |
272 | cmpb $1, -1(%rsi) | |
273 | jne L_nonpartital | |
274 | ||
275 | L_PARTIAL_TAG: | |
276 | movzbl (next_qpos),%edx // qpos = *next_qpos | |
277 | incq next_qpos // next_qpos++ | |
278 | movl (%rsp,%rdx,4), %eax // read dictionary word | |
279 | andl $-1024, %eax // clear lower 10 bits | |
280 | or (%r11), %ax // pad the lower 10-bits from *next_low_bits | |
281 | addq $2, %r11 // next_low_bits++ | |
282 | decl tags_counter // tags_counter-- | |
283 | movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word | |
284 | movl %eax, -4(dest_buf) // *dest_buf = newly formed word | |
285 | jg L_next // repeat loop until next_tag==tag_area_end | |
286 | ||
287 | L_done: | |
288 | ||
289 | // release stack memory, restore registers, and return | |
290 | ||
291 | addq $(64+8+16), %rsp | |
292 | popq %rbx | |
293 | popq %r13 | |
294 | popq %r12 | |
295 | leave | |
296 | ret | |
297 | ||
298 | .align 4,0x90 | |
299 | L_MISS_TAG: | |
300 | movl (next_full_patt), %edx // w = *next_full_patt | |
301 | movl (next_full_patt), %eax // w = *next_full_patt | |
302 | shrl $10, %edx // w>>10 | |
303 | addq $4, next_full_patt // next_full_patt++ | |
304 | movzbl %dl, %edx // 8-bit hash table index | |
305 | movl %eax, -4(dest_buf) // *dest_buf = word | |
306 | movzbl (hash,%rdx),%edx // qpos | |
307 | decl tags_counter // tags_counter-- | |
308 | movl %eax, (%rsp,%rdx) // dictionary[qpos] = word | |
309 | jg L_next // repeat the loop | |
310 | jmp L_done | |
311 | ||
312 | .align 4,0x90 | |
313 | L_ZERO_TAG: | |
314 | decl tags_counter // tags_counter-- | |
315 | movl $0, -4(dest_buf) // *dest_buf = 0 | |
316 | jg L_next // repeat the loop | |
317 | jmp L_done | |
318 | ||
319 |