]>
Commit | Line | Data |
---|---|---|
39236c6e A |
1 | /* |
2 | * Copyright (c) 2000-2013 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains x86_64 hand optimized implementation of WKdm memory page decompressor. | |
31 | ||
32 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); | |
33 | ||
34 | input : | |
35 | src_buf : address of input compressed data buffer | |
36 | dest_buf : address of output decompressed buffer | |
37 | scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller | |
38 | words : this argument is not used in the implementation | |
39 | ||
40 | output : | |
41 | ||
42 | the input buffer is decompressed and the dest_buf is written with decompressed data. | |
43 | ||
44 | Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s | |
45 | ||
46 | The bit stream (*src_buf) consists of | |
47 | a. 12 bytes header | |
48 | b. 256 bytes for 1024 packed tags | |
49 | c. (varying number of) words for new words not matched to dictionary word. | |
50 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
51 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
52 | ||
53 | where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. | |
54 | ||
55 | The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows | |
56 | ||
57 | for (i=0;i<1024;i++) { | |
58 | tag = *next_tag++ | |
59 | switch (tag) { | |
60 | case 0 : *dest_buf++ = 0; break; | |
61 | case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; | |
62 | case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; | |
63 | case 3 : *dest_buf++ = dictionary[*dict_index++]; break; | |
64 | } | |
65 | ||
66 | cclee, 11/30/12 | |
67 | */ | |
68 | ||
69 | .text | |
70 | ||
71 | .globl _WKdm_decompress_new | |
72 | _WKdm_decompress_new: | |
73 | ||
74 | // save registers, and allocate stack memory for local variables | |
75 | ||
76 | pushq %rbp | |
77 | movq %rsp, %rbp | |
78 | pushq %r12 | |
79 | pushq %r13 | |
80 | pushq %rbx | |
81 | ||
82 | subq $(64+8+16), %rsp | |
83 | ||
84 | movq %rsi, %r12 // dest_buf | |
85 | movq %rdx, %r13 // scracht_buf | |
86 | ||
87 | // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp) | |
88 | #if 1 | |
89 | movl $1, 0(%rsp) | |
90 | movl $1, 4(%rsp) | |
91 | movl $1, 8(%rsp) | |
92 | movl $1, 12(%rsp) | |
93 | movl $1, 16(%rsp) | |
94 | movl $1, 20(%rsp) | |
95 | movl $1, 24(%rsp) | |
96 | movl $1, 28(%rsp) | |
97 | movl $1, 32(%rsp) | |
98 | movl $1, 36(%rsp) | |
99 | movl $1, 40(%rsp) | |
100 | movl $1, 44(%rsp) | |
101 | movl $1, 48(%rsp) | |
102 | movl $1, 52(%rsp) | |
103 | movl $1, 56(%rsp) | |
104 | movl $1, 60(%rsp) | |
105 | #else | |
106 | mov $0x100000001, %rax | |
107 | mov %rax, (%rsp) | |
108 | mov %rax, 8(%rsp) | |
109 | mov %rax, 16(%rsp) | |
110 | mov %rax, 24(%rsp) | |
111 | mov %rax, 32(%rsp) | |
112 | mov %rax, 40(%rsp) | |
113 | mov %rax, 48(%rsp) | |
114 | mov %rax, 56(%rsp) | |
115 | #endif | |
116 | ||
117 | // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); | |
118 | ||
119 | leaq 268(%rdi), %r10 // TAGS_AREA_END | |
120 | leaq 12(%rdi), %rax // TAGS_AREA_START | |
121 | movq %r13, %rsi // tempTagsArray | |
122 | cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START | |
123 | jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits | |
124 | movq %r13, %rcx // next_word | |
125 | xorl %r8d, %r8d // i = 0 | |
126 | mov $(50529027<<32)+50529027, %r9 | |
127 | L_WK_unpack_2bits: | |
128 | movl 12(%rdi,%r8, 4), %eax | |
129 | movl 12(%rdi,%r8, 4), %edx | |
130 | shrl $2, %eax | |
131 | shlq $32, %rax | |
132 | orq %rdx, %rax | |
133 | movq %rax, %rdx | |
134 | shrq $4, %rax | |
135 | andq %r9, %rdx | |
136 | andq %r9, %rax | |
137 | incq %r8 // i++ | |
138 | movq %rdx, (%rcx) | |
139 | movq %rax, 8(%rcx) | |
140 | addq $16, %rcx // next_tags += 16 | |
141 | cmpq $64, %r8 // i vs 64 | |
142 | jne L_WK_unpack_2bits // repeat loop until i==64 | |
143 | 1: | |
144 | ||
145 | ||
146 | // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); | |
147 | ||
148 | mov 4(%rdi), %eax // WKdm header qpos end | |
149 | leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END | |
150 | mov 0(%rdi), %eax // WKdm header qpos start | |
151 | leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START | |
152 | leaq 1024(%r13), %rbx // tempQPosArray | |
153 | cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START | |
154 | jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits | |
155 | leaq 8(%rbx), %rcx // next_qpos | |
156 | ||
157 | mov $(252645135<<32)+252645135, %r11 | |
158 | L_WK_unpack_4bits: | |
159 | movl (%r8), %eax // w = *next_word | |
160 | movl %eax, %edx // w | |
161 | shlq $28, %rax | |
162 | orq %rdx, %rax | |
163 | addq $4, %r8 // next_word++ | |
164 | andq %r11, %rax | |
165 | movq %rax, -8(%rcx) | |
166 | addq $8, %rcx // next_qpos+=8 | |
167 | cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START | |
168 | ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START | |
169 | ||
170 | ||
171 | 1: | |
172 | ||
173 | // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); | |
174 | ||
175 | movl 8(%rdi), %eax // LOW_BITS_AREA_END offset | |
176 | leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END | |
177 | leaq 2048(%r13), %r11 // tempLowBitsArray | |
178 | leaq 4094(%r13), %r13 // final tenbits addr | |
179 | sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END | |
180 | jle 1f // if START>=END, skip L_WK_unpack_3_tenbits | |
181 | movq %r11, %rcx // next_low_bits | |
182 | L_WK_unpack_3_tenbits: | |
183 | movl (%r9), %eax // w = *next_word, 0:c:b:a | |
184 | movl $(1023<<10), %edx | |
185 | movl $(1023<<20), %r8d | |
186 | andl %eax, %edx // b << 10 | |
187 | andl %eax, %r8d // c << 20 | |
188 | andq $1023, %rax | |
189 | shll $6, %edx | |
190 | shlq $12, %r8 | |
191 | orl %edx, %eax | |
192 | orq %r8, %rax | |
193 | cmp %r13, %rcx | |
194 | je 2f | |
195 | mov %rax, (%rcx) | |
196 | jmp 3f | |
197 | 2: mov %ax, (%rcx) | |
198 | 3: | |
199 | addq $4, %r9 // next_word++ | |
200 | addq $6, %rcx // next_low_bits += 3 | |
201 | sub $4, %rdi | |
202 | jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word | |
203 | 1: | |
204 | ||
205 | ||
206 | #define next_qpos %rbx | |
207 | #define hash %r8 | |
208 | #define tags_counter %edi | |
209 | #define dest_buf %r12 | |
210 | #define next_full_patt %r10 | |
211 | ||
212 | leaq _hashLookupTable_new(%rip), hash // hash look up table | |
213 | movl $1024, tags_counter // tags_counter | |
214 | jmp L_next | |
215 | ||
216 | .align 4,0x90 | |
217 | L_nonpartital: | |
218 | jl L_ZERO_TAG | |
219 | cmpb $2, -1(%rsi) | |
220 | je L_MISS_TAG | |
221 | ||
222 | L_EXACT_TAG: | |
223 | movzbl (next_qpos), %eax // qpos = *next_qpos | |
224 | incq next_qpos // next_qpos++ | |
225 | decl tags_counter // tags_counter-- | |
226 | movl (%rsp,%rax,4), %eax // w = dictionary[qpos] | |
227 | movl %eax, -4(dest_buf) // *dest_buf = w | |
228 | je L_done | |
229 | ||
230 | L_next: | |
231 | incq %rsi // next_tag++ | |
232 | addq $4, dest_buf | |
233 | cmpb $1, -1(%rsi) | |
234 | jne L_nonpartital | |
235 | ||
236 | L_PARTIAL_TAG: | |
237 | movzbl (next_qpos),%edx // qpos = *next_qpos | |
238 | incq next_qpos // next_qpos++ | |
239 | movl (%rsp,%rdx,4), %eax // read dictionary word | |
240 | andl $-1024, %eax // clear lower 10 bits | |
241 | or (%r11), %ax // pad the lower 10-bits from *next_low_bits | |
242 | addq $2, %r11 // next_low_bits++ | |
243 | decl tags_counter // tags_counter-- | |
244 | movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word | |
245 | movl %eax, -4(dest_buf) // *dest_buf = newly formed word | |
246 | jg L_next // repeat loop until next_tag==tag_area_end | |
247 | ||
248 | L_done: | |
249 | ||
250 | // release stack memory, restore registers, and return | |
251 | ||
252 | addq $(64+8+16), %rsp | |
253 | popq %rbx | |
254 | popq %r13 | |
255 | popq %r12 | |
256 | leave | |
257 | ret | |
258 | ||
259 | .align 4,0x90 | |
260 | L_MISS_TAG: | |
261 | movl (next_full_patt), %edx // w = *next_full_patt | |
262 | movl (next_full_patt), %eax // w = *next_full_patt | |
263 | shrl $10, %edx // w>>10 | |
264 | addq $4, next_full_patt // next_full_patt++ | |
265 | movzbl %dl, %edx // 8-bit hash table index | |
266 | movl %eax, -4(dest_buf) // *dest_buf = word | |
267 | movzbl (hash,%rdx),%edx // qpos | |
268 | decl tags_counter // tags_counter-- | |
269 | movl %eax, (%rsp,%rdx) // dictionary[qpos] = word | |
270 | jg L_next // repeat the loop | |
271 | jmp L_done | |
272 | ||
273 | .align 4,0x90 | |
274 | L_ZERO_TAG: | |
275 | decl tags_counter // tags_counter-- | |
276 | movl $0, -4(dest_buf) // *dest_buf = 0 | |
277 | jg L_next // repeat the loop | |
278 | jmp L_done | |
279 | ||
280 |