]> git.saurik.com Git - apple/xnu.git/blob - osfmk/arm/WKdmDecompress_new.s
xnu-4570.71.2.tar.gz
[apple/xnu.git] / osfmk / arm / WKdmDecompress_new.s
1 /*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 This file contains armv7 hand optimized implementation of WKdm memory page decompressor.
31
32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
33
34 input :
35 src_buf : address of input compressed data buffer
36 dest_buf : address of output decompressed buffer
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38 words : this argument is not used in the implementation
39
40 output :
41
42 the input buffer is decompressed and the dest_buf is written with decompressed data.
43
44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress armv7 assembly code WKdmCompress.s
45
46 The bit stream (*src_buf) consists of
47 a. 12 bytes header
48 b. 256 bytes for 1024 packed tags
49 c. (varying number of) words for new words not matched to dictionary word.
50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
52
53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
54
55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
56
57 for (i=0;i<1024;i++) {
58 tag = *next_tag++
59 switch (tag) {
60 case 0 : *dest_buf++ = 0; break;
61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break;
64 }
65
66 cclee, 11/9/12
67
68 Added zero page, single value page, sparse page, early abort optimizations
69 rsrini, 09/14/14
70
71 */
72
73 #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding
74
75 #define ZERO 0
76 #define PARTIAL_MATCH 1
77 #define MISS_TAG 2
78 #define MATCH 3
79
80 .text
81 .syntax unified
82 .align 4
83
84 // void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes);
85
86 .globl _WKdm_decompress_new
87 _WKdm_decompress_new:
88
89 /*
90 -------- symbolizing registers --------
91 the armv7 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names.
92 */
93
94 #define src_buf r0
95 #define dest_buf r1
96 #define scratch r2
97 #define eax r3
98 #define ebx r4
99 #define hashTable r4
100 #define ecx r5
101 #define edx r6
102 #define n_bytes r8
103 #define next_tag r12
104 #define tags_counter lr
105 #define dictionary sp
106 #define v0 q0
107 #define v1 q1
108 #define v2 q2
109 #define v3 q3
110 #define v4 q4
111 #define v5 q5
112
113 // and scratch memory for local variables
114
115 // [sp,#0] : dictionary
116 // [scratch,#0] : tempTagsArray was 64
117 // [scratch,#1024] : tempQPosArray was 1088
118 // [scratch,#2048] : tempLowBitsArray was 2112
119
120 push {r7, lr}
121 mov r7, sp
122 push {r4-r6,r8-r11}
123 #if KERNEL
124 sub ecx, sp, #96
125 sub sp, sp, #96
126 vst1.64 {q0,q1},[ecx]!
127 vst1.64 {q2,q3},[ecx]!
128 vst1.64 {q4,q5},[ecx]!
129 #endif
130 sub sp, sp, #64 // allocate for dictionary
131
132 mov n_bytes, r3 // save the n_bytes passed as function args
133 ldr eax, [src_buf] // read the 1st word from the header
134 mov ecx, #MZV_MAGIC
135 cmp eax, ecx // is the alternate packer used (i.e. is MZV page)?
136 bne L_default_decompressor // default decompressor was used
137
138 // Mostly Zero Page Handling...
139 // {
140 add src_buf, src_buf, 4 // skip the header
141 mov eax, dest_buf
142 mov ecx, #4096 // number of bytes to zero out
143 mov r9, #0
144 mov r10, #0
145 mov r11, #0
146 mov r12, #0
147 1:
148 subs ecx, ecx, #64
149 stmia eax!, {r9-r12}
150 stmia eax!, {r9-r12}
151 stmia eax!, {r9-r12}
152 stmia eax!, {r9-r12}
153 bne 1b
154
155 mov r12, #4 // current byte position in src to read from
156 2:
157 ldr eax, [src_buf], #4 // get the word
158 ldrh edx, [src_buf], #2 // get the index
159 str eax, [dest_buf, edx] // store non-0 word in the destination buffer
160 add r12, r12, #6 // 6 more bytes processed
161 cmp r12, n_bytes // finished processing all the bytes?
162 bne 2b
163 b L_done
164 // }
165
166 L_default_decompressor:
167
168 /*
169 ---------------------- set up registers and PRELOAD_DICTIONARY ---------------------------------
170 */
171 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
172 vmov.i32 q0, #0
173 mov r8, sp
174 adr ebx, _table_2bits
175 vst1.64 {q0}, [r8]!
176 add r10, src_buf, #268 // TAGS_AREA_END
177 vst1.64 {q0}, [r8]!
178 add eax, src_buf, #12 // TAGS_AREA_START
179 vst1.64 {q0}, [r8]!
180 mov ecx, scratch // tempTagsArray
181 vst1.64 {q0}, [r8]!
182 vld1.64 {q0,q1},[ebx,:128]
183
184
185 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
186 /*
187 unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes
188 for arm64, this can be done by
189 1. read the input 32-bit word into GPR w
190 2. duplicate GPR into 4 elements in a vector register v0
191 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6}
192 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303
193 */
194
195 L_WK_unpack_2bits:
196 vld1.64 {v5}, [eax]! // read 4 32-bit words for 64 2-bit tags
197 vdup.32 v2, d10[0] // duplicate to 4 elements
198 vdup.32 v3, d10[1] // duplicate to 4 elements
199 vdup.32 v4, d11[0] // duplicate to 4 elements
200 vdup.32 v5, d11[1] // duplicate to 4 elements
201 vshl.u32 v2, v2, v0 // v0 = {0, -2, -4, -6}
202 vshl.u32 v3, v3, v0 // v0 = {0, -2, -4, -6}
203 vshl.u32 v4, v4, v0 // v0 = {0, -2, -4, -6}
204 vshl.u32 v5, v5, v0 // v0 = {0, -2, -4, -6}
205 vand v2, v2, v1 // v1 = {3,3,...,3}
206 vand v3, v3, v1 // v1 = {3,3,...,3}
207 vand v4, v4, v1 // v1 = {3,3,...,3}
208 vand v5, v5, v1 // v1 = {3,3,...,3}
209 vst1.64 {v2,v3}, [ecx,:128]! // write 64 tags into tempTagsArray
210 cmp r10, eax // TAGS_AREA_END vs TAGS_AREA_START
211 vst1.64 {v4,v5}, [ecx,:128]! // write 64 tags into tempTagsArray
212 bhi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits
213
214
215 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
216
217 ldm src_buf, {r8,r9} // WKdm header qpos start and end
218 adr ebx, _table_4bits
219 subs r12, r9, r8 // r12 = (QPOS_AREA_END - QPOS_AREA_START)/4
220 add r8, src_buf, r8, lsl #2 // QPOS_AREA_START
221 add r9, src_buf, r9, lsl #2 // QPOS_AREA_END
222 bls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
223 add ecx, scratch, #1024 // tempQPosArray
224 vld1.64 {v0,v1},[ebx,:128]
225
226 subs r12, r12, #1
227 bls 2f // do loop of 2 only if w14 >= 5
228 L_WK_unpack_4bits:
229 vld1.64 {d4}, [r8]! // read a 32-bit word for 8 4-bit positions
230 subs r12, r12, #2
231 vmov d5, d4
232 vzip.32 d4, d5
233 vshl.u32 v2, v2, v0 // v0 = {0, -4, 0, -4}
234 vand v2, v2, v1 // v1 = {15,15,...,15}
235 vst1.64 {q2}, [ecx,:128]!
236 bhi L_WK_unpack_4bits
237 2:
238 adds r12, r12, #1
239 ble 1f
240
241 ldr r12, [r8], #4 // read a 32-bit word for 8 4-bit positions
242 vdup.32 d4, r12 // duplicate to 2 elements
243 vshl.u32 v2, v2, v0 // v0 = {0, -4}
244 vand v2, v2, v1 // v1 = {15,15,...,15}
245 vst1.64 {d4}, [ecx,:64]! // write 16 tags into tempTagsArray
246
247 1:
248
249 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
250
251 ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset
252 add r8, src_buf, eax, lsl #2 // LOW_BITS_AREA_END
253 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
254 add ecx, scratch, #2048 // tempLowBitsArray
255 add edx, scratch, #4096 // last tenbits
256 bls 1f // if START>=END, skip L_WK_unpack_3_tenbits
257
258 adr ebx, _table_10bits
259 vld1.64 {v0,v1},[ebx,:128]
260
261 mov r11, #0x03ff
262 L_WK_unpack_3_tenbits:
263 ldr r12, [r9], #4 // read a 32-bit word for 3 low 10-bits
264 and lr, r11, r12
265 strh lr, [ecx], #2
266 cmp ecx, edx
267 and lr, r11, r12, lsr #10
268 beq 1f
269 strh lr, [ecx], #2
270 and lr, r11, r12, lsr #20
271 strh lr, [ecx], #2
272
273 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
274 bhi L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
275
276 1:
277 /*
278 set up before going to the main decompress loop
279 */
280
281 mov next_tag, scratch // tempTagsArray
282 add r8, scratch, #1024 // next_qpos
283 add r11, scratch, #2048 // tempLowBitsArray
284 #if defined(KERNEL) && !SLIDABLE
285 adr hashTable, L_table
286 ldr hashTable, [hashTable]
287 #else
288 ldr hashTable, L_table
289 L_table0:
290 ldr hashTable, [pc, hashTable]
291 #endif
292 mov tags_counter, #1024 // tags_counter
293
294 b L_next
295
296 .align 4,0x90
297 L_ZERO_TAG:
298 /*
299 we can only get here if w9 = 0, meaning this is a zero tag
300 *dest_buf++ = 0;
301 */
302 subs tags_counter,tags_counter,#1 // tags_counter--
303 str r9, [dest_buf], #4 // *dest_buf++ = 0
304 ble L_done // if next_tag >= tag_area_end, we're done
305
306 /* WKdm decompress main loop */
307 L_next:
308 ldrb r9, [next_tag], #1 // new tag
309 cmp r9, #0
310 beq L_ZERO_TAG
311 cmp r9, #2 // partial match tag ?
312 beq L_MISS_TAG
313 bgt L_EXACT_TAG
314
315 L_PARTIAL_TAG:
316 /*
317 this is a partial match:
318 dict_word = dictionary[*dict_index];
319 dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++;
320 */
321 ldrb edx, [r8], #1 // qpos = *next_qpos++
322 ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++
323 ldr eax, [dictionary, edx, lsl #2] // read dictionary word
324 subs tags_counter,tags_counter,#1 // tags_counter--
325 lsr eax, eax, #10 // clear lower 10 bits
326 orr eax, ecx, eax, lsl #10 // pad the lower 10-bits from *next_low_bits
327 str eax, [dictionary,edx,lsl #2] // *dict_location = newly formed word
328 str eax, [dest_buf], #4 // *dest_buf++ = newly formed word
329 bgt L_next // repeat loop until next_tag==tag_area_end
330
331 L_done:
332
333 add sp, sp, #64 // deallocate for dictionary
334
335 // release stack memory, restore registers, and return
336 #if KERNEL
337 vld1.64 {q0,q1},[sp]!
338 vld1.64 {q2,q3},[sp]!
339 vld1.64 {q4,q5},[sp]!
340 #endif
341 pop {r4-r6,r8-r11}
342 pop {r7,pc}
343
344 .align 4,0x90
345 L_MISS_TAG:
346 /*
347 this is a dictionary miss.
348 x = *new_word++;
349 k = (x>>10)&255;
350 k = hashTable[k];
351 dictionary[k] = *dest_buf++ = x;
352 */
353 subs tags_counter,tags_counter,#1 // tags_counter--
354 ldr eax, [r10], #4 // w = *next_full_patt++
355 lsr edx, eax, #10 // w>>10
356 str eax, [dest_buf], #4 // *dest_buf++ = word
357 and edx, edx, #0x0ff // 8-bit hash table index
358 ldrb edx, [ebx, edx] // qpos
359 str eax, [dictionary,edx] // dictionary[qpos] = word
360 bgt L_next // repeat the loop
361 b L_done // if next_tag >= tag_area_end, we're done
362
363 .align 4,0x90
364
365 L_EXACT_TAG:
366 /*
367 this is an exact match;
368 *dest_buf++ = dictionary[*dict_index++];
369 */
370
371 ldrb eax, [r8], #1 // qpos = *next_qpos++
372 subs tags_counter,tags_counter,#1 // tags_counter--
373 ldr eax, [dictionary,eax,lsl #2] // w = dictionary[qpos]
374 str eax, [dest_buf], #4 // *dest_buf++ = w
375 bgt L_next // repeat the loop
376 b L_done // if next_tag >= tag_area_end, we're done
377
378
379 .align 4
380
381 _table_2bits:
382 .word 0
383 .word -2
384 .word -4
385 .word -6
386 .word 0x03030303
387 .word 0x03030303
388 .word 0x03030303
389 .word 0x03030303
390
391 _table_4bits:
392 .word 0
393 .word -4
394 .word 0
395 .word -4
396 .word 0x0f0f0f0f
397 .word 0x0f0f0f0f
398 .word 0x0f0f0f0f
399 .word 0x0f0f0f0f
400
401 _table_10bits:
402 .word 0
403 .word -10
404 .word -20
405 .word 0
406 .word 1023
407 .word 1023
408 .word 1023
409 .word 0
410
411
412 #if defined(KERNEL) && !SLIDABLE
413 .align 2
414 L_table:
415 .long _hashLookupTable_new
416 #else
417 .align 2
418 L_table:
419 .long L_Tab$non_lazy_ptr-(L_table0+8)
420
421 .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
422 .align 2
423 L_Tab$non_lazy_ptr:
424 .indirect_symbol _hashLookupTable_new
425 .long 0
426 #endif
427