]>
Commit | Line | Data |
---|---|---|
5ba3f43e A |
1 | /* |
2 | * Copyright (c) 2000-2013 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains armv7 hand optimized implementation of WKdm memory page decompressor. | |
31 | ||
32 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); | |
33 | ||
34 | input : | |
35 | src_buf : address of input compressed data buffer | |
36 | dest_buf : address of output decompressed buffer | |
37 | scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller | |
38 | words : this argument is not used in the implementation | |
39 | ||
40 | output : | |
41 | ||
42 | the input buffer is decompressed and the dest_buf is written with decompressed data. | |
43 | ||
44 | Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress armv7 assembly code WKdmCompress.s | |
45 | ||
46 | The bit stream (*src_buf) consists of | |
47 | a. 12 bytes header | |
48 | b. 256 bytes for 1024 packed tags | |
49 | c. (varying number of) words for new words not matched to dictionary word. | |
50 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
51 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
52 | ||
53 | where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. | |
54 | ||
55 | The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows | |
56 | ||
57 | for (i=0;i<1024;i++) { | |
58 | tag = *next_tag++ | |
59 | switch (tag) { | |
60 | case 0 : *dest_buf++ = 0; break; | |
61 | case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; | |
62 | case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; | |
63 | case 3 : *dest_buf++ = dictionary[*dict_index++]; break; | |
64 | } | |
65 | ||
66 | cclee, 11/9/12 | |
67 | ||
68 | Added zero page, single value page, sparse page, early abort optimizations | |
69 | rsrini, 09/14/14 | |
70 | ||
71 | */ | |
72 | ||
73 | #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding | |
74 | ||
75 | #define ZERO 0 | |
76 | #define PARTIAL_MATCH 1 | |
77 | #define MISS_TAG 2 | |
78 | #define MATCH 3 | |
79 | ||
80 | .text | |
81 | .syntax unified | |
82 | .align 4 | |
83 | ||
84 | // void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes); | |
85 | ||
86 | .globl _WKdm_decompress_new | |
87 | _WKdm_decompress_new: | |
88 | ||
89 | /* | |
90 | -------- symbolizing registers -------- | |
91 | the armv7 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names. | |
92 | */ | |
93 | ||
94 | #define src_buf r0 | |
95 | #define dest_buf r1 | |
96 | #define scratch r2 | |
97 | #define eax r3 | |
98 | #define ebx r4 | |
99 | #define hashTable r4 | |
100 | #define ecx r5 | |
101 | #define edx r6 | |
102 | #define n_bytes r8 | |
103 | #define next_tag r12 | |
104 | #define tags_counter lr | |
105 | #define dictionary sp | |
106 | #define v0 q0 | |
107 | #define v1 q1 | |
108 | #define v2 q2 | |
109 | #define v3 q3 | |
110 | #define v4 q4 | |
111 | #define v5 q5 | |
112 | ||
113 | // and scratch memory for local variables | |
114 | ||
115 | // [sp,#0] : dictionary | |
116 | // [scratch,#0] : tempTagsArray was 64 | |
117 | // [scratch,#1024] : tempQPosArray was 1088 | |
118 | // [scratch,#2048] : tempLowBitsArray was 2112 | |
119 | ||
120 | push {r7, lr} | |
121 | mov r7, sp | |
122 | push {r4-r6,r8-r11} | |
123 | #if KERNEL | |
124 | sub ecx, sp, #96 | |
125 | sub sp, sp, #96 | |
126 | vst1.64 {q0,q1},[ecx]! | |
127 | vst1.64 {q2,q3},[ecx]! | |
128 | vst1.64 {q4,q5},[ecx]! | |
129 | #endif | |
130 | sub sp, sp, #64 // allocate for dictionary | |
131 | ||
132 | mov n_bytes, r3 // save the n_bytes passed as function args | |
133 | ldr eax, [src_buf] // read the 1st word from the header | |
134 | mov ecx, #MZV_MAGIC | |
135 | cmp eax, ecx // is the alternate packer used (i.e. is MZV page)? | |
136 | bne L_default_decompressor // default decompressor was used | |
137 | ||
138 | // Mostly Zero Page Handling... | |
139 | // { | |
140 | add src_buf, src_buf, 4 // skip the header | |
141 | mov eax, dest_buf | |
142 | mov ecx, #4096 // number of bytes to zero out | |
143 | mov r9, #0 | |
144 | mov r10, #0 | |
145 | mov r11, #0 | |
146 | mov r12, #0 | |
147 | 1: | |
148 | subs ecx, ecx, #64 | |
149 | stmia eax!, {r9-r12} | |
150 | stmia eax!, {r9-r12} | |
151 | stmia eax!, {r9-r12} | |
152 | stmia eax!, {r9-r12} | |
153 | bne 1b | |
154 | ||
155 | mov r12, #4 // current byte position in src to read from | |
156 | 2: | |
157 | ldr eax, [src_buf], #4 // get the word | |
158 | ldrh edx, [src_buf], #2 // get the index | |
159 | str eax, [dest_buf, edx] // store non-0 word in the destination buffer | |
160 | add r12, r12, #6 // 6 more bytes processed | |
161 | cmp r12, n_bytes // finished processing all the bytes? | |
162 | bne 2b | |
163 | b L_done | |
164 | // } | |
165 | ||
166 | L_default_decompressor: | |
167 | ||
168 | /* | |
169 | ---------------------- set up registers and PRELOAD_DICTIONARY --------------------------------- | |
170 | */ | |
171 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR | |
172 | vmov.i32 q0, #0 | |
173 | mov r8, sp | |
174 | adr ebx, _table_2bits | |
175 | vst1.64 {q0}, [r8]! | |
176 | add r10, src_buf, #268 // TAGS_AREA_END | |
177 | vst1.64 {q0}, [r8]! | |
178 | add eax, src_buf, #12 // TAGS_AREA_START | |
179 | vst1.64 {q0}, [r8]! | |
180 | mov ecx, scratch // tempTagsArray | |
181 | vst1.64 {q0}, [r8]! | |
182 | vld1.64 {q0,q1},[ebx,:128] | |
183 | ||
184 | ||
185 | // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); | |
186 | /* | |
187 | unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes | |
188 | for arm64, this can be done by | |
189 | 1. read the input 32-bit word into GPR w | |
190 | 2. duplicate GPR into 4 elements in a vector register v0 | |
191 | 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6} | |
192 | 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303 | |
193 | */ | |
194 | ||
195 | L_WK_unpack_2bits: | |
196 | vld1.64 {v5}, [eax]! // read 4 32-bit words for 64 2-bit tags | |
197 | vdup.32 v2, d10[0] // duplicate to 4 elements | |
198 | vdup.32 v3, d10[1] // duplicate to 4 elements | |
199 | vdup.32 v4, d11[0] // duplicate to 4 elements | |
200 | vdup.32 v5, d11[1] // duplicate to 4 elements | |
201 | vshl.u32 v2, v2, v0 // v0 = {0, -2, -4, -6} | |
202 | vshl.u32 v3, v3, v0 // v0 = {0, -2, -4, -6} | |
203 | vshl.u32 v4, v4, v0 // v0 = {0, -2, -4, -6} | |
204 | vshl.u32 v5, v5, v0 // v0 = {0, -2, -4, -6} | |
205 | vand v2, v2, v1 // v1 = {3,3,...,3} | |
206 | vand v3, v3, v1 // v1 = {3,3,...,3} | |
207 | vand v4, v4, v1 // v1 = {3,3,...,3} | |
208 | vand v5, v5, v1 // v1 = {3,3,...,3} | |
209 | vst1.64 {v2,v3}, [ecx,:128]! // write 64 tags into tempTagsArray | |
210 | cmp r10, eax // TAGS_AREA_END vs TAGS_AREA_START | |
211 | vst1.64 {v4,v5}, [ecx,:128]! // write 64 tags into tempTagsArray | |
212 | bhi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits | |
213 | ||
214 | ||
215 | // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); | |
216 | ||
217 | ldm src_buf, {r8,r9} // WKdm header qpos start and end | |
218 | adr ebx, _table_4bits | |
219 | subs r12, r9, r8 // r12 = (QPOS_AREA_END - QPOS_AREA_START)/4 | |
220 | add r8, src_buf, r8, lsl #2 // QPOS_AREA_START | |
221 | add r9, src_buf, r9, lsl #2 // QPOS_AREA_END | |
222 | bls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits | |
223 | add ecx, scratch, #1024 // tempQPosArray | |
224 | vld1.64 {v0,v1},[ebx,:128] | |
225 | ||
226 | subs r12, r12, #1 | |
227 | bls 2f // do loop of 2 only if w14 >= 5 | |
228 | L_WK_unpack_4bits: | |
229 | vld1.64 {d4}, [r8]! // read a 32-bit word for 8 4-bit positions | |
230 | subs r12, r12, #2 | |
231 | vmov d5, d4 | |
232 | vzip.32 d4, d5 | |
233 | vshl.u32 v2, v2, v0 // v0 = {0, -4, 0, -4} | |
234 | vand v2, v2, v1 // v1 = {15,15,...,15} | |
235 | vst1.64 {q2}, [ecx,:128]! | |
236 | bhi L_WK_unpack_4bits | |
237 | 2: | |
238 | adds r12, r12, #1 | |
239 | ble 1f | |
240 | ||
241 | ldr r12, [r8], #4 // read a 32-bit word for 8 4-bit positions | |
242 | vdup.32 d4, r12 // duplicate to 2 elements | |
243 | vshl.u32 v2, v2, v0 // v0 = {0, -4} | |
244 | vand v2, v2, v1 // v1 = {15,15,...,15} | |
245 | vst1.64 {d4}, [ecx,:64]! // write 16 tags into tempTagsArray | |
246 | ||
247 | 1: | |
248 | ||
249 | // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); | |
250 | ||
251 | ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset | |
252 | add r8, src_buf, eax, lsl #2 // LOW_BITS_AREA_END | |
253 | cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END | |
254 | add ecx, scratch, #2048 // tempLowBitsArray | |
255 | add edx, scratch, #4096 // last tenbits | |
256 | bls 1f // if START>=END, skip L_WK_unpack_3_tenbits | |
257 | ||
258 | adr ebx, _table_10bits | |
259 | vld1.64 {v0,v1},[ebx,:128] | |
260 | ||
261 | mov r11, #0x03ff | |
262 | L_WK_unpack_3_tenbits: | |
263 | ldr r12, [r9], #4 // read a 32-bit word for 3 low 10-bits | |
264 | and lr, r11, r12 | |
265 | strh lr, [ecx], #2 | |
266 | cmp ecx, edx | |
267 | and lr, r11, r12, lsr #10 | |
268 | beq 1f | |
269 | strh lr, [ecx], #2 | |
270 | and lr, r11, r12, lsr #20 | |
271 | strh lr, [ecx], #2 | |
272 | ||
273 | cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END | |
274 | bhi L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word | |
275 | ||
276 | 1: | |
277 | /* | |
278 | set up before going to the main decompress loop | |
279 | */ | |
280 | ||
281 | mov next_tag, scratch // tempTagsArray | |
282 | add r8, scratch, #1024 // next_qpos | |
283 | add r11, scratch, #2048 // tempLowBitsArray | |
284 | #if defined(KERNEL) && !SLIDABLE | |
285 | adr hashTable, L_table | |
286 | ldr hashTable, [hashTable] | |
287 | #else | |
288 | ldr hashTable, L_table | |
289 | L_table0: | |
290 | ldr hashTable, [pc, hashTable] | |
291 | #endif | |
292 | mov tags_counter, #1024 // tags_counter | |
293 | ||
294 | b L_next | |
295 | ||
296 | .align 4,0x90 | |
297 | L_ZERO_TAG: | |
298 | /* | |
299 | we can only get here if w9 = 0, meaning this is a zero tag | |
300 | *dest_buf++ = 0; | |
301 | */ | |
302 | subs tags_counter,tags_counter,#1 // tags_counter-- | |
303 | str r9, [dest_buf], #4 // *dest_buf++ = 0 | |
304 | ble L_done // if next_tag >= tag_area_end, we're done | |
305 | ||
306 | /* WKdm decompress main loop */ | |
307 | L_next: | |
308 | ldrb r9, [next_tag], #1 // new tag | |
309 | cmp r9, #0 | |
310 | beq L_ZERO_TAG | |
311 | cmp r9, #2 // partial match tag ? | |
312 | beq L_MISS_TAG | |
313 | bgt L_EXACT_TAG | |
314 | ||
315 | L_PARTIAL_TAG: | |
316 | /* | |
317 | this is a partial match: | |
318 | dict_word = dictionary[*dict_index]; | |
319 | dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; | |
320 | */ | |
321 | ldrb edx, [r8], #1 // qpos = *next_qpos++ | |
322 | ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++ | |
323 | ldr eax, [dictionary, edx, lsl #2] // read dictionary word | |
324 | subs tags_counter,tags_counter,#1 // tags_counter-- | |
325 | lsr eax, eax, #10 // clear lower 10 bits | |
326 | orr eax, ecx, eax, lsl #10 // pad the lower 10-bits from *next_low_bits | |
327 | str eax, [dictionary,edx,lsl #2] // *dict_location = newly formed word | |
328 | str eax, [dest_buf], #4 // *dest_buf++ = newly formed word | |
329 | bgt L_next // repeat loop until next_tag==tag_area_end | |
330 | ||
331 | L_done: | |
332 | ||
333 | add sp, sp, #64 // deallocate for dictionary | |
334 | ||
335 | // release stack memory, restore registers, and return | |
336 | #if KERNEL | |
337 | vld1.64 {q0,q1},[sp]! | |
338 | vld1.64 {q2,q3},[sp]! | |
339 | vld1.64 {q4,q5},[sp]! | |
340 | #endif | |
341 | pop {r4-r6,r8-r11} | |
342 | pop {r7,pc} | |
343 | ||
344 | .align 4,0x90 | |
345 | L_MISS_TAG: | |
346 | /* | |
347 | this is a dictionary miss. | |
348 | x = *new_word++; | |
349 | k = (x>>10)&255; | |
350 | k = hashTable[k]; | |
351 | dictionary[k] = *dest_buf++ = x; | |
352 | */ | |
353 | subs tags_counter,tags_counter,#1 // tags_counter-- | |
354 | ldr eax, [r10], #4 // w = *next_full_patt++ | |
355 | lsr edx, eax, #10 // w>>10 | |
356 | str eax, [dest_buf], #4 // *dest_buf++ = word | |
357 | and edx, edx, #0x0ff // 8-bit hash table index | |
358 | ldrb edx, [ebx, edx] // qpos | |
359 | str eax, [dictionary,edx] // dictionary[qpos] = word | |
360 | bgt L_next // repeat the loop | |
361 | b L_done // if next_tag >= tag_area_end, we're done | |
362 | ||
363 | .align 4,0x90 | |
364 | ||
365 | L_EXACT_TAG: | |
366 | /* | |
367 | this is an exact match; | |
368 | *dest_buf++ = dictionary[*dict_index++]; | |
369 | */ | |
370 | ||
371 | ldrb eax, [r8], #1 // qpos = *next_qpos++ | |
372 | subs tags_counter,tags_counter,#1 // tags_counter-- | |
373 | ldr eax, [dictionary,eax,lsl #2] // w = dictionary[qpos] | |
374 | str eax, [dest_buf], #4 // *dest_buf++ = w | |
375 | bgt L_next // repeat the loop | |
376 | b L_done // if next_tag >= tag_area_end, we're done | |
377 | ||
378 | ||
379 | .align 4 | |
380 | ||
381 | _table_2bits: | |
382 | .word 0 | |
383 | .word -2 | |
384 | .word -4 | |
385 | .word -6 | |
386 | .word 0x03030303 | |
387 | .word 0x03030303 | |
388 | .word 0x03030303 | |
389 | .word 0x03030303 | |
390 | ||
391 | _table_4bits: | |
392 | .word 0 | |
393 | .word -4 | |
394 | .word 0 | |
395 | .word -4 | |
396 | .word 0x0f0f0f0f | |
397 | .word 0x0f0f0f0f | |
398 | .word 0x0f0f0f0f | |
399 | .word 0x0f0f0f0f | |
400 | ||
401 | _table_10bits: | |
402 | .word 0 | |
403 | .word -10 | |
404 | .word -20 | |
405 | .word 0 | |
406 | .word 1023 | |
407 | .word 1023 | |
408 | .word 1023 | |
409 | .word 0 | |
410 | ||
411 | ||
412 | #if defined(KERNEL) && !SLIDABLE | |
413 | .align 2 | |
414 | L_table: | |
415 | .long _hashLookupTable_new | |
416 | #else | |
417 | .align 2 | |
418 | L_table: | |
419 | .long L_Tab$non_lazy_ptr-(L_table0+8) | |
420 | ||
421 | .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers | |
422 | .align 2 | |
423 | L_Tab$non_lazy_ptr: | |
424 | .indirect_symbol _hashLookupTable_new | |
425 | .long 0 | |
426 | #endif | |
427 |