]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2000-2014 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains arm64 hand optimized implementation of WKdm memory page decompressor. | |
31 | ||
32 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word *scratch, __unused__ unsigned int words); | |
33 | ||
34 | input : | |
35 | src_buf : address of input compressed data buffer | |
36 | dest_buf : address of output decompressed buffer | |
37 | scratch : an 8-k bytes scratch mempro provided by the caller | |
38 | words : this argument is used by the mostly-zero-value decoder | |
39 | ||
40 | output : | |
41 | ||
42 | the input buffer is decompressed and the dest_buf is written with decompressed data. | |
43 | ||
44 | Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress arm64 assembly code WKdmCompress.s | |
45 | ||
46 | The bit stream (*src_buf) consists of | |
47 | a. 12 bytes header | |
48 | b. 256 bytes for 1024 packed tags | |
49 | c. (varying number of) words for new words not matched to dictionary word. | |
50 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
51 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
52 | ||
53 | where the header (of 3 words) specifies the ending boundaries (in 32-bit words) of the bit stream of c,d,e, respectively. | |
54 | ||
55 | The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows | |
56 | ||
57 | for (i=0;i<1024;i++) { | |
58 | tag = *next_tag++ | |
59 | switch (tag) { | |
60 | case 0 : *dest_buf++ = 0; break; | |
61 | case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; | |
62 | case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; | |
63 | case 3 : *dest_buf++ = dictionary[*dict_index++]; break; | |
64 | } | |
65 | ||
66 | cclee, Nov 9, '12 | |
67 | ||
68 | Added zero page, single value page, sparse page, early abort optimizations | |
69 | rsrini, 09/14/14 | |
70 | */ | |
71 | ||
72 | #define PAGES_SIZE_IN_KBYTES 16 | |
73 | #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding | |
74 | ||
75 | #ifndef PAGES_SIZE_IN_KBYTES | |
76 | #define PAGES_SIZE_IN_KBYTES 4 | |
77 | #endif | |
78 | ||
79 | #if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16)) | |
80 | #error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported" | |
81 | #endif | |
82 | ||
83 | #define scale (PAGES_SIZE_IN_KBYTES/4) | |
84 | ||
85 | ||
86 | .align 4 | |
87 | .text | |
88 | ||
89 | /* | |
90 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes); | |
91 | */ | |
92 | ||
93 | .globl _WKdm_decompress_16k | |
94 | _WKdm_decompress_16k: | |
95 | ||
96 | /* | |
97 | -------- symbolizing registers -------- | |
98 | the arm64 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names. | |
99 | */ | |
100 | ||
101 | #define src_buf x0 | |
102 | #define dest_buf x1 | |
103 | #define scratch x2 | |
104 | #define n_bytes x3 | |
105 | #define dictionary sp | |
106 | #define rax x13 | |
107 | #define eax w13 | |
108 | #define rbx x4 | |
109 | #define ebx w4 | |
110 | #define rcx x5 | |
111 | #define ecx w5 | |
112 | #define rdx x6 | |
113 | #define edx w6 | |
114 | #define tags_counter x7 | |
115 | #define next_tag x12 | |
116 | #define r8 x8 | |
117 | #define r9 x9 | |
118 | #define r10 x10 | |
119 | #define r11 x11 | |
120 | #define r12 x12 | |
121 | ||
122 | /* | |
123 | ||
124 | ------ scratch memory for local variables --------- | |
125 | ||
126 | [sp,#0] : dictionary | |
127 | [scratch,#0] : tempTagsArray | |
128 | [scratch,#1024] : tempQPosArray | |
129 | [scratch,#2048] : tempLowBitsArray | |
130 | ||
131 | */ | |
132 | ||
133 | #if KERNEL | |
134 | sub rax, sp, #96 | |
135 | sub sp, sp, #96 | |
136 | st1.4s {v0,v1,v2},[rax],#48 | |
137 | st1.4s {v3,v4,v5},[rax],#48 | |
138 | #endif | |
139 | ||
140 | sub sp, sp, #64 | |
141 | ||
142 | ldr eax, [src_buf] // read the 1st word from the header | |
143 | mov ecx, #MZV_MAGIC | |
144 | cmp eax, ecx // is the alternate packer used (i.e. is MZV page)? | |
145 | b.ne L_default_decompressor // default decompressor was used | |
146 | ||
147 | // Mostly Zero Page Handling... | |
148 | // { | |
149 | add src_buf, src_buf, 4 // skip the header | |
150 | mov rax, dest_buf | |
151 | mov rcx, #(PAGES_SIZE_IN_KBYTES*1024) // number of bytes to zero out | |
152 | 1: | |
153 | dc zva, rax // zero 64 bytes. since dest_buf is a page, it will be 4096 or 16384 byte aligned | |
154 | add rax, rax, #64 | |
155 | dc zva, rax | |
156 | add rax, rax, #64 | |
157 | dc zva, rax | |
158 | add rax, rax, #64 | |
159 | dc zva, rax | |
160 | add rax, rax, #64 | |
161 | subs rcx, rcx, #256 | |
162 | b.ne 1b | |
163 | ||
164 | mov r12, #4 // current byte position in src to read from | |
165 | mov rdx, #0 | |
166 | 2: | |
167 | ldr eax, [src_buf], #4 // get the word | |
168 | ldrh edx, [src_buf], #2 // get the index | |
169 | str eax, [dest_buf, rdx] // store non-0 word in the destination buffer | |
170 | add r12, r12, #6 // 6 more bytes processed | |
171 | cmp r12, n_bytes // finished processing all the bytes? | |
172 | b.ne 2b | |
173 | b L_done | |
174 | // } | |
175 | ||
176 | L_default_decompressor: | |
177 | ||
178 | /* | |
179 | ---------------------- set up registers and PRELOAD_DICTIONARY --------------------------------- | |
180 | */ | |
181 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR | |
182 | adrp rbx, _table_2bits@GOTPAGE | |
183 | stp xzr, xzr, [dictionary, #0] | |
184 | add r10, src_buf, #(12+256*scale) // TAGS_AREA_END | |
185 | stp xzr, xzr, [dictionary, #16] | |
186 | add rax, src_buf, #12 // TAGS_AREA_START | |
187 | ldr rbx, [rbx, _table_2bits@GOTPAGEOFF] | |
188 | stp xzr, xzr, [dictionary, #32] | |
189 | mov rcx, scratch // tempTagsArray | |
190 | stp xzr, xzr, [dictionary, #48] | |
191 | ld1.4s {v0,v1},[rbx] | |
192 | ||
193 | ||
194 | /* | |
195 | ------------------------------ unpacking bit stream ---------------------------------- | |
196 | */ | |
197 | ||
198 | // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); | |
199 | /* | |
200 | unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes | |
201 | for arm64, this can be done by | |
202 | 1. read the input 32-bit word into GPR w | |
203 | 2. duplicate GPR into 4 elements in a vector register v0 | |
204 | 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6} | |
205 | 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303 | |
206 | */ | |
207 | ||
208 | L_WK_unpack_2bits: | |
209 | ldr q5, [rax], #16 // read 4 32-bit words for 64 2-bit tags | |
210 | dup.4s v2, v5[0] // duplicate to 4 elements | |
211 | dup.4s v3, v5[1] // duplicate to 4 elements | |
212 | dup.4s v4, v5[2] // duplicate to 4 elements | |
213 | dup.4s v5, v5[3] // duplicate to 4 elements | |
214 | ushl.4s v2, v2, v0 // v1 = {0, -2, -4, -6} | |
215 | ushl.4s v3, v3, v0 // v1 = {0, -2, -4, -6} | |
216 | ushl.4s v4, v4, v0 // v1 = {0, -2, -4, -6} | |
217 | ushl.4s v5, v5, v0 // v1 = {0, -2, -4, -6} | |
218 | and.16b v2, v2, v1 // v2 = {3,3,...,3} | |
219 | and.16b v3, v3, v1 // v2 = {3,3,...,3} | |
220 | and.16b v4, v4, v1 // v2 = {3,3,...,3} | |
221 | and.16b v5, v5, v1 // v2 = {3,3,...,3} | |
222 | cmp r10, rax // TAGS_AREA_END vs TAGS_AREA_START | |
223 | st1.4s {v2,v3,v4,v5}, [rcx], #64 // write 64 tags into tempTagsArray | |
224 | b.hi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits | |
225 | ||
226 | ||
227 | // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); | |
228 | ||
229 | ldp w8, w9, [src_buf] // WKdm header qpos start and end | |
230 | adrp rbx, _table_4bits@GOTPAGE | |
231 | subs x14, r9, r8 // x14 = (QPOS_AREA_END - QPOS_AREA_START)/4 | |
232 | add r8, src_buf, r8, lsl #2 // QPOS_AREA_START | |
233 | add r9, src_buf, r9, lsl #2 // QPOS_AREA_END | |
234 | ||
235 | b.ls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits | |
236 | ldr rbx, [rbx, _table_4bits@GOTPAGEOFF] | |
237 | add rcx, scratch, #(1024*scale) // tempQPosArray | |
238 | ld1.4s {v0,v1},[rbx] | |
239 | subs w14, w14, #1 | |
240 | b.ls 2f // do loop of 2 only if w14 >= 5 | |
241 | L_WK_unpack_4bits: | |
242 | ldr d2, [r8], #8 // read a 32-bit word for 8 4-bit positions | |
243 | subs w14, w14, #2 | |
244 | zip1.4s v2, v2, v2 | |
245 | ushl.4s v2, v2, v0 // v1 = {0, -4, 0, -4} | |
246 | and.16b v2, v2, v1 // v2 = {15,15,...,15} | |
247 | str q2, [rcx], #16 | |
248 | b.hi L_WK_unpack_4bits | |
249 | 2: | |
250 | adds w14, w14, #1 | |
251 | b.le 1f | |
252 | ||
253 | ldr s3, [r8], #4 // read a 32-bit word for 8 4-bit positions | |
254 | dup.2s v2, v3[0] // duplicate to 2 elements | |
255 | ushl.2s v2, v2, v0 // v1 = {0, -4} | |
256 | and.8b v2, v2, v1 // v2 = {15,15,...,15} | |
257 | str d2, [rcx], #8 // write 16 tags into tempTagsArray | |
258 | ||
259 | 1: | |
260 | ||
261 | // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); | |
262 | ||
263 | ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset | |
264 | add r8, src_buf, rax, lsl #2 // LOW_BITS_AREA_END | |
265 | add rcx, scratch, #(2048*scale) // tempLowBitsArray | |
266 | #if (scale==1) | |
267 | add r11, scratch, #(4096*scale-2) // final tenbits for the rare case | |
268 | #else | |
269 | add r11, scratch, #(4096*scale) // final tenbits for the rare case | |
270 | sub r11, r11, #2 | |
271 | #endif | |
272 | subs r8, r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END | |
273 | b.ls 1f // if START>=END, skip L_WK_unpack_3_tenbits | |
274 | ||
275 | adrp rbx, _table_10bits@GOTPAGE | |
276 | ldr rbx, [rbx, _table_10bits@GOTPAGEOFF] | |
277 | ld1.4s {v0,v1,v2,v3},[rbx] | |
278 | ||
279 | /* | |
280 | a very rare case : 1024 tenbits, 1023 + 1 -> 341 + final 1 that is padded with 2 zeros | |
281 | since the scratch memory is 4k (2k for this section), we need to pay attention to the last case | |
282 | so we don't overwrite to the scratch memory | |
283 | ||
284 | we 1st do a single 3_tenbits, followed by 2x_3_tenbits loop, and detect whether the last 3_tenbits | |
285 | hits the raee case | |
286 | */ | |
287 | #if 1 | |
288 | subs r8, r8, #4 // pre-decrement by 8 | |
289 | ldr s4, [r9], #4 // read 32-bit words for 3 low 10-bits | |
290 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. | |
291 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. | |
292 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. | |
293 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. | |
294 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets | |
295 | orr.16b v4, v4, v5 // combine data | |
296 | str d4, [rcx], #6 // write 3 low 10-bits | |
297 | b.eq 1f | |
298 | #endif | |
299 | ||
300 | subs r8, r8, #8 // pre-decrement by 8 | |
301 | b.lt L_WK_unpack_3_tenbits | |
302 | ||
303 | L_WK_unpack_2x_3_tenbits: | |
304 | ldr d4, [r9], #8 // read 2 32-bit words for a pair of 3 low 10-bits | |
305 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. | |
306 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. | |
307 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. | |
308 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. | |
309 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets | |
310 | orr.16b v4, v4, v5 // combine data | |
311 | ins v5.d[0], v4.d[1] | |
312 | str d4, [rcx], #6 // write 3 low 10-bits | |
313 | str d5, [rcx], #6 // write 3 low 10-bits | |
314 | ||
315 | subs r8, r8, #8 | |
316 | b.ge L_WK_unpack_2x_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word | |
317 | ||
318 | tst r8, #4 | |
319 | b.eq 1f | |
320 | ||
321 | L_WK_unpack_3_tenbits: | |
322 | ldr s4, [r9] // read 32-bit words for 3 low 10-bits | |
323 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. | |
324 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. | |
325 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. | |
326 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. | |
327 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets | |
328 | orr.16b v4, v4, v5 // combine data | |
329 | #if 0 | |
330 | str d4, [rcx] // write 3 low 10-bits | |
331 | #else | |
332 | cmp rcx, r11 | |
333 | b.eq 2f | |
334 | str d4, [rcx] // write 3 low 10-bits | |
335 | b 1f | |
336 | 2: | |
337 | str h4, [rcx] // write final 1 low 10-bits | |
338 | #endif | |
339 | 1: | |
340 | ||
341 | /* | |
342 | set up before going to the main decompress loop | |
343 | */ | |
344 | mov next_tag, scratch // tempTagsArray | |
345 | add r8, scratch, #(1024*scale) // next_qpos | |
346 | add r11, scratch, #(2048*scale) // tempLowBitsArray | |
347 | adrp rbx, _hashLookupTable@GOTPAGE | |
348 | mov tags_counter, #(1024*scale) // tag_area_end | |
349 | ldr rbx, [rbx, _hashLookupTable@GOTPAGEOFF] | |
350 | ||
351 | b L_next | |
352 | ||
353 | .align 4,0x90 | |
354 | L_ZERO_TAG: | |
355 | /* | |
356 | we can only get here if w9 = 0, meaning this is a zero tag | |
357 | *dest_buf++ = 0; | |
358 | */ | |
359 | str w9, [dest_buf], #4 // *dest_buf++ = 0 | |
360 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end | |
361 | b.ls L_done // if next_tag >= tag_area_end, we're done | |
362 | ||
363 | /* WKdm decompress main loop */ | |
364 | L_next: | |
365 | ldrb w9, [next_tag], #1 // new tag | |
366 | cbz w9, L_ZERO_TAG | |
367 | cmp w9, #2 // partial match tag ? | |
368 | b.eq L_MISS_TAG | |
369 | b.gt L_EXACT_TAG | |
370 | ||
371 | L_PARTIAL_TAG: | |
372 | /* | |
373 | this is a partial match: | |
374 | dict_word = dictionary[*dict_index]; | |
375 | dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; | |
376 | */ | |
377 | ||
378 | ldrb edx, [r8], #1 // qpos = *next_qpos++ | |
379 | ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++ | |
380 | ldr eax, [dictionary, rdx, lsl #2] // read dictionary word | |
381 | bfm eax, ecx, #0, #9 // pad the lower 10-bits from *next_low_bits | |
382 | str eax, [dictionary,rdx,lsl #2] // *dict_location = newly formed word | |
383 | str eax, [dest_buf], #4 // *dest_buf++ = newly formed word | |
384 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end | |
385 | b.gt L_next // repeat loop until next_tag==tag_area_end | |
386 | ||
387 | L_done: | |
388 | ||
389 | // release stack memory, restore registers, and return | |
390 | add sp, sp, #64 // deallocate for dictionary | |
391 | #if KERNEL | |
392 | ld1.4s {v0,v1,v2},[sp],#48 | |
393 | ld1.4s {v3,v4,v5},[sp],#48 | |
394 | #endif | |
395 | ret lr | |
396 | ||
397 | .align 4,0x90 | |
398 | L_MISS_TAG: | |
399 | /* | |
400 | this is a dictionary miss. | |
401 | x = *new_word++; | |
402 | k = (x>>10)&255; | |
403 | k = hashTable[k]; | |
404 | dictionary[k] = *dest_buf++ = x; | |
405 | */ | |
406 | ldr eax, [r10], #4 // w = *next_full_patt++ | |
407 | ubfm edx, eax, #10, #17 // 8-bit hash table index | |
408 | str eax, [dest_buf], #4 // *dest_buf++ = word | |
409 | ldrb edx, [rbx, rdx] // qpos | |
410 | str eax, [dictionary,rdx] // dictionary[qpos] = word | |
411 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end | |
412 | b.gt L_next // repeat the loop | |
413 | b L_done // if next_tag >= tag_area_end, we're done | |
414 | ||
415 | .align 4,0x90 | |
416 | L_EXACT_TAG: | |
417 | /* | |
418 | this is an exact match; | |
419 | *dest_buf++ = dictionary[*dict_index++]; | |
420 | */ | |
421 | ldrb eax, [r8], #1 // qpos = *next_qpos++ | |
422 | ldr eax, [dictionary,rax,lsl #2] // w = dictionary[qpos] | |
423 | str eax, [dest_buf], #4 // *dest_buf++ = w | |
424 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end | |
425 | b.gt L_next // repeat the loop | |
426 | b L_done // if next_tag >= tag_area_end, we're done | |
427 | ||
428 |