2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
30 This file contains armv7 hand optimized implementation of WKdm memory page decompressor.
32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
35 src_buf : address of input compressed data buffer
36 dest_buf : address of output decompressed buffer
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38 words : this argument is not used in the implementation
42 the input buffer is decompressed and the dest_buf is written with decompressed data.
44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress armv7 assembly code WKdmCompress.s
46 The bit stream (*src_buf) consists of
48 b. 256 bytes for 1024 packed tags
49 c. (varying number of) words for new words not matched to dictionary word.
50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
57 for (i=0;i<1024;i++) {
60 case 0 : *dest_buf++ = 0; break;
61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break;
68 Added zero page, single value page, sparse page, early abort optimizations
73 #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding
76 #define PARTIAL_MATCH 1
84 // void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes);
86 .globl _WKdm_decompress_new
90 -------- symbolizing registers --------
91 the armv7 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names.
104 #define tags_counter lr
105 #define dictionary sp
113 // and scratch memory for local variables
115 // [sp,#0] : dictionary
116 // [scratch,#0] : tempTagsArray was 64
117 // [scratch,#1024] : tempQPosArray was 1088
118 // [scratch,#2048] : tempLowBitsArray was 2112
126 vst1.64 {q0,q1},[ecx]!
127 vst1.64 {q2,q3},[ecx]!
128 vst1.64 {q4,q5},[ecx]!
130 sub sp, sp, #64 // allocate for dictionary
132 mov n_bytes, r3 // save the n_bytes passed as function args
133 ldr eax, [src_buf] // read the 1st word from the header
135 cmp eax, ecx // is the alternate packer used (i.e. is MZV page)?
136 bne L_default_decompressor // default decompressor was used
138 // Mostly Zero Page Handling...
140 add src_buf, src_buf, 4 // skip the header
142 mov ecx, #4096 // number of bytes to zero out
155 mov r12, #4 // current byte position in src to read from
157 ldr eax, [src_buf], #4 // get the word
158 ldrh edx, [src_buf], #2 // get the index
159 str eax, [dest_buf, edx] // store non-0 word in the destination buffer
160 add r12, r12, #6 // 6 more bytes processed
161 cmp r12, n_bytes // finished processing all the bytes?
166 L_default_decompressor:
169 ---------------------- set up registers and PRELOAD_DICTIONARY ---------------------------------
171 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
174 adr ebx, _table_2bits
176 add r10, src_buf, #268 // TAGS_AREA_END
178 add eax, src_buf, #12 // TAGS_AREA_START
180 mov ecx, scratch // tempTagsArray
182 vld1.64 {q0,q1},[ebx,:128]
185 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
187 unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes
188 for arm64, this can be done by
189 1. read the input 32-bit word into GPR w
190 2. duplicate GPR into 4 elements in a vector register v0
191 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6}
192 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303
196 vld1.64 {v5}, [eax]! // read 4 32-bit words for 64 2-bit tags
197 vdup.32 v2, d10[0] // duplicate to 4 elements
198 vdup.32 v3, d10[1] // duplicate to 4 elements
199 vdup.32 v4, d11[0] // duplicate to 4 elements
200 vdup.32 v5, d11[1] // duplicate to 4 elements
201 vshl.u32 v2, v2, v0 // v0 = {0, -2, -4, -6}
202 vshl.u32 v3, v3, v0 // v0 = {0, -2, -4, -6}
203 vshl.u32 v4, v4, v0 // v0 = {0, -2, -4, -6}
204 vshl.u32 v5, v5, v0 // v0 = {0, -2, -4, -6}
205 vand v2, v2, v1 // v1 = {3,3,...,3}
206 vand v3, v3, v1 // v1 = {3,3,...,3}
207 vand v4, v4, v1 // v1 = {3,3,...,3}
208 vand v5, v5, v1 // v1 = {3,3,...,3}
209 vst1.64 {v2,v3}, [ecx,:128]! // write 64 tags into tempTagsArray
210 cmp r10, eax // TAGS_AREA_END vs TAGS_AREA_START
211 vst1.64 {v4,v5}, [ecx,:128]! // write 64 tags into tempTagsArray
212 bhi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits
215 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
217 ldm src_buf, {r8,r9} // WKdm header qpos start and end
218 adr ebx, _table_4bits
219 subs r12, r9, r8 // r12 = (QPOS_AREA_END - QPOS_AREA_START)/4
220 add r8, src_buf, r8, lsl #2 // QPOS_AREA_START
221 add r9, src_buf, r9, lsl #2 // QPOS_AREA_END
222 bls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
223 add ecx, scratch, #1024 // tempQPosArray
224 vld1.64 {v0,v1},[ebx,:128]
227 bls 2f // do loop of 2 only if w14 >= 5
229 vld1.64 {d4}, [r8]! // read a 32-bit word for 8 4-bit positions
233 vshl.u32 v2, v2, v0 // v0 = {0, -4, 0, -4}
234 vand v2, v2, v1 // v1 = {15,15,...,15}
235 vst1.64 {q2}, [ecx,:128]!
236 bhi L_WK_unpack_4bits
241 ldr r12, [r8], #4 // read a 32-bit word for 8 4-bit positions
242 vdup.32 d4, r12 // duplicate to 2 elements
243 vshl.u32 v2, v2, v0 // v0 = {0, -4}
244 vand v2, v2, v1 // v1 = {15,15,...,15}
245 vst1.64 {d4}, [ecx,:64]! // write 16 tags into tempTagsArray
249 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
251 ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset
252 add r8, src_buf, eax, lsl #2 // LOW_BITS_AREA_END
253 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
254 add ecx, scratch, #2048 // tempLowBitsArray
255 add edx, scratch, #4096 // last tenbits
256 bls 1f // if START>=END, skip L_WK_unpack_3_tenbits
258 adr ebx, _table_10bits
259 vld1.64 {v0,v1},[ebx,:128]
262 L_WK_unpack_3_tenbits:
263 ldr r12, [r9], #4 // read a 32-bit word for 3 low 10-bits
267 and lr, r11, r12, lsr #10
270 and lr, r11, r12, lsr #20
273 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
274 bhi L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
278 set up before going to the main decompress loop
281 mov next_tag, scratch // tempTagsArray
282 add r8, scratch, #1024 // next_qpos
283 add r11, scratch, #2048 // tempLowBitsArray
284 #if defined(KERNEL) && !SLIDABLE
285 adr hashTable, L_table
286 ldr hashTable, [hashTable]
288 ldr hashTable, L_table
290 ldr hashTable, [pc, hashTable]
292 mov tags_counter, #1024 // tags_counter
299 we can only get here if w9 = 0, meaning this is a zero tag
302 subs tags_counter,tags_counter,#1 // tags_counter--
303 str r9, [dest_buf], #4 // *dest_buf++ = 0
304 ble L_done // if next_tag >= tag_area_end, we're done
306 /* WKdm decompress main loop */
308 ldrb r9, [next_tag], #1 // new tag
311 cmp r9, #2 // partial match tag ?
317 this is a partial match:
318 dict_word = dictionary[*dict_index];
319 dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++;
321 ldrb edx, [r8], #1 // qpos = *next_qpos++
322 ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++
323 ldr eax, [dictionary, edx, lsl #2] // read dictionary word
324 subs tags_counter,tags_counter,#1 // tags_counter--
325 lsr eax, eax, #10 // clear lower 10 bits
326 orr eax, ecx, eax, lsl #10 // pad the lower 10-bits from *next_low_bits
327 str eax, [dictionary,edx,lsl #2] // *dict_location = newly formed word
328 str eax, [dest_buf], #4 // *dest_buf++ = newly formed word
329 bgt L_next // repeat loop until next_tag==tag_area_end
333 add sp, sp, #64 // deallocate for dictionary
335 // release stack memory, restore registers, and return
337 vld1.64 {q0,q1},[sp]!
338 vld1.64 {q2,q3},[sp]!
339 vld1.64 {q4,q5},[sp]!
347 this is a dictionary miss.
351 dictionary[k] = *dest_buf++ = x;
353 subs tags_counter,tags_counter,#1 // tags_counter--
354 ldr eax, [r10], #4 // w = *next_full_patt++
355 lsr edx, eax, #10 // w>>10
356 str eax, [dest_buf], #4 // *dest_buf++ = word
357 and edx, edx, #0x0ff // 8-bit hash table index
358 ldrb edx, [ebx, edx] // qpos
359 str eax, [dictionary,edx] // dictionary[qpos] = word
360 bgt L_next // repeat the loop
361 b L_done // if next_tag >= tag_area_end, we're done
367 this is an exact match;
368 *dest_buf++ = dictionary[*dict_index++];
371 ldrb eax, [r8], #1 // qpos = *next_qpos++
372 subs tags_counter,tags_counter,#1 // tags_counter--
373 ldr eax, [dictionary,eax,lsl #2] // w = dictionary[qpos]
374 str eax, [dest_buf], #4 // *dest_buf++ = w
375 bgt L_next // repeat the loop
376 b L_done // if next_tag >= tag_area_end, we're done
412 #if defined(KERNEL) && !SLIDABLE
415 .long _hashLookupTable_new
419 .long L_Tab$non_lazy_ptr-(L_table0+8)
421 .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
424 .indirect_symbol _hashLookupTable_new