]> git.saurik.com Git - apple/xnu.git/blob - osfmk/x86_64/WKdmDecompress_new.s
xnu-3248.40.184.tar.gz
[apple/xnu.git] / osfmk / x86_64 / WKdmDecompress_new.s
1 /*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor.
31
32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
33
34 input :
35 src_buf : address of input compressed data buffer
36 dest_buf : address of output decompressed buffer
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38 words : this argument is not used in the implementation
39
40 output :
41
42 the input buffer is decompressed and the dest_buf is written with decompressed data.
43
44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s
45
46 The bit stream (*src_buf) consists of
47 a. 12 bytes header
48 b. 256 bytes for 1024 packed tags
49 c. (varying number of) words for new words not matched to dictionary word.
50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
52
53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
54
55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
56
57 for (i=0;i<1024;i++) {
58 tag = *next_tag++
59 switch (tag) {
60 case 0 : *dest_buf++ = 0; break;
61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break;
64 }
65
66 cclee, 11/30/12
67
68 Added zero page, single value page, sparse page, early abort optimizations
69 rsrini, 09/14/14
70 */
71
72 #define MZV_MAGIC $17185 // magic value used to identify MZV page encoding
73
74 .text
75
76 .globl _WKdm_decompress_new
77 _WKdm_decompress_new:
78
79 // save registers, and allocate stack memory for local variables
80
81 pushq %rbp
82 movq %rsp, %rbp
83 pushq %r12
84 pushq %r13
85 pushq %rbx
86
87 subq $(64+8+16), %rsp
88
89 movl 0(%rdi), %eax // read the 1st word from the header
90 cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)?
91 jne L_default_decompressor // default decompressor was used
92
93 // Mostly Zero Page Handling...
94 // {
95 movq $0, %rax
96 1: // Zero out the entire page
97 movq $0, 0(%rsi, %rax)
98 movq $0, 8(%rsi, %rax)
99 movq $0, 16(%rsi, %rax)
100 movq $0, 24(%rsi, %rax)
101 movq $0, 32(%rsi, %rax)
102 movq $0, 40(%rsi, %rax)
103 movq $0, 48(%rsi, %rax)
104 movq $0, 56(%rsi, %rax)
105 addq $64, %rax
106 cmpq $4096, %rax
107 jne 1b
108
109 movq $4, %r12 // current byte position in src to read from
110 2:
111 movl 0(%rdi, %r12), %eax // get the word
112 movzwq 4(%rdi, %r12), %rdx // get the index
113 movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer
114 addq $6, %r12 // 6 more bytes processed
115 cmpl %ecx, %r12d // finished processing all the bytes?
116 jne 2b
117 jmp L_done
118 // }
119
120 L_default_decompressor:
121
122 movq %rsi, %r12 // dest_buf
123 movq %rdx, %r13 // scratch_buf
124
125 // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp)
126 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
127 #if 1
128 movl $0, 0(%rsp)
129 movl $0, 4(%rsp)
130 movl $0, 8(%rsp)
131 movl $0, 12(%rsp)
132 movl $0, 16(%rsp)
133 movl $0, 20(%rsp)
134 movl $0, 24(%rsp)
135 movl $0, 28(%rsp)
136 movl $0, 32(%rsp)
137 movl $0, 36(%rsp)
138 movl $0, 40(%rsp)
139 movl $0, 44(%rsp)
140 movl $0, 48(%rsp)
141 movl $0, 52(%rsp)
142 movl $0, 56(%rsp)
143 movl $0, 60(%rsp)
144 #else
145 mov $0x100000001, %rax
146 mov %rax, (%rsp)
147 mov %rax, 8(%rsp)
148 mov %rax, 16(%rsp)
149 mov %rax, 24(%rsp)
150 mov %rax, 32(%rsp)
151 mov %rax, 40(%rsp)
152 mov %rax, 48(%rsp)
153 mov %rax, 56(%rsp)
154 #endif
155
156 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
157
158 leaq 268(%rdi), %r10 // TAGS_AREA_END
159 leaq 12(%rdi), %rax // TAGS_AREA_START
160 movq %r13, %rsi // tempTagsArray
161 cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START
162 jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
163 movq %r13, %rcx // next_word
164 xorl %r8d, %r8d // i = 0
165 mov $(50529027<<32)+50529027, %r9
166 L_WK_unpack_2bits:
167 movl 12(%rdi,%r8, 4), %eax
168 movl 12(%rdi,%r8, 4), %edx
169 shrl $2, %eax
170 shlq $32, %rax
171 orq %rdx, %rax
172 movq %rax, %rdx
173 shrq $4, %rax
174 andq %r9, %rdx
175 andq %r9, %rax
176 incq %r8 // i++
177 movq %rdx, (%rcx)
178 movq %rax, 8(%rcx)
179 addq $16, %rcx // next_tags += 16
180 cmpq $64, %r8 // i vs 64
181 jne L_WK_unpack_2bits // repeat loop until i==64
182 1:
183
184
185 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
186
187 mov 4(%rdi), %eax // WKdm header qpos end
188 leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END
189 mov 0(%rdi), %eax // WKdm header qpos start
190 leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START
191 leaq 1024(%r13), %rbx // tempQPosArray
192 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
193 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
194 leaq 8(%rbx), %rcx // next_qpos
195
196 mov $(252645135<<32)+252645135, %r11
197 L_WK_unpack_4bits:
198 movl (%r8), %eax // w = *next_word
199 movl %eax, %edx // w
200 shlq $28, %rax
201 orq %rdx, %rax
202 addq $4, %r8 // next_word++
203 andq %r11, %rax
204 movq %rax, -8(%rcx)
205 addq $8, %rcx // next_qpos+=8
206 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START
207 ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START
208
209
210 1:
211
212 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
213
214 movl 8(%rdi), %eax // LOW_BITS_AREA_END offset
215 leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END
216 leaq 2048(%r13), %r11 // tempLowBitsArray
217 leaq 4094(%r13), %r13 // final tenbits addr
218 sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END
219 jle 1f // if START>=END, skip L_WK_unpack_3_tenbits
220 movq %r11, %rcx // next_low_bits
221 L_WK_unpack_3_tenbits:
222 movl (%r9), %eax // w = *next_word, 0:c:b:a
223 movl $(1023<<10), %edx
224 movl $(1023<<20), %r8d
225 andl %eax, %edx // b << 10
226 andl %eax, %r8d // c << 20
227 andq $1023, %rax
228 shll $6, %edx
229 shlq $12, %r8
230 orl %edx, %eax
231 orq %r8, %rax
232 cmp %r13, %rcx
233 je 2f
234 mov %rax, (%rcx)
235 jmp 3f
236 2: mov %ax, (%rcx)
237 3:
238 addq $4, %r9 // next_word++
239 addq $6, %rcx // next_low_bits += 3
240 sub $4, %rdi
241 jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word
242 1:
243
244
245 #define next_qpos %rbx
246 #define hash %r8
247 #define tags_counter %edi
248 #define dest_buf %r12
249 #define next_full_patt %r10
250
251 leaq _hashLookupTable_new(%rip), hash // hash look up table
252 movl $1024, tags_counter // tags_counter
253 jmp L_next
254
255 .align 4,0x90
256 L_nonpartital:
257 jl L_ZERO_TAG
258 cmpb $2, -1(%rsi)
259 je L_MISS_TAG
260
261 L_EXACT_TAG:
262 movzbl (next_qpos), %eax // qpos = *next_qpos
263 incq next_qpos // next_qpos++
264 decl tags_counter // tags_counter--
265 movl (%rsp,%rax,4), %eax // w = dictionary[qpos]
266 movl %eax, -4(dest_buf) // *dest_buf = w
267 je L_done
268
269 L_next:
270 incq %rsi // next_tag++
271 addq $4, dest_buf
272 cmpb $1, -1(%rsi)
273 jne L_nonpartital
274
275 L_PARTIAL_TAG:
276 movzbl (next_qpos),%edx // qpos = *next_qpos
277 incq next_qpos // next_qpos++
278 movl (%rsp,%rdx,4), %eax // read dictionary word
279 andl $-1024, %eax // clear lower 10 bits
280 or (%r11), %ax // pad the lower 10-bits from *next_low_bits
281 addq $2, %r11 // next_low_bits++
282 decl tags_counter // tags_counter--
283 movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word
284 movl %eax, -4(dest_buf) // *dest_buf = newly formed word
285 jg L_next // repeat loop until next_tag==tag_area_end
286
287 L_done:
288
289 // release stack memory, restore registers, and return
290
291 addq $(64+8+16), %rsp
292 popq %rbx
293 popq %r13
294 popq %r12
295 leave
296 ret
297
298 .align 4,0x90
299 L_MISS_TAG:
300 movl (next_full_patt), %edx // w = *next_full_patt
301 movl (next_full_patt), %eax // w = *next_full_patt
302 shrl $10, %edx // w>>10
303 addq $4, next_full_patt // next_full_patt++
304 movzbl %dl, %edx // 8-bit hash table index
305 movl %eax, -4(dest_buf) // *dest_buf = word
306 movzbl (hash,%rdx),%edx // qpos
307 decl tags_counter // tags_counter--
308 movl %eax, (%rsp,%rdx) // dictionary[qpos] = word
309 jg L_next // repeat the loop
310 jmp L_done
311
312 .align 4,0x90
313 L_ZERO_TAG:
314 decl tags_counter // tags_counter--
315 movl $0, -4(dest_buf) // *dest_buf = 0
316 jg L_next // repeat the loop
317 jmp L_done
318
319