]>
Commit | Line | Data |
---|---|---|
5ba3f43e A |
1 | /* |
2 | * Copyright (c) 2000-2013 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | /* | |
30 | This file contains armv7 hand optimized implementation of WKdm memory page compressor. | |
31 | ||
32 | int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); | |
33 | ||
34 | input : | |
35 | src_buf : address of input page (length = 1024 words) | |
36 | dest_buf : address of output buffer (may not be 16-byte aligned) | |
37 | scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller, | |
38 | bytes_budget : a given byte target in compression | |
39 | ||
40 | output : | |
41 | ||
42 | if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data | |
43 | o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget. | |
44 | During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area. | |
45 | When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined. | |
46 | The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault. | |
47 | ||
48 | The 4th argument bytes_budget is the target compress budget in bytes. | |
49 | If the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes. | |
50 | Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed). | |
51 | ||
52 | WKdm Compression algorithm is briefly stated as follows: | |
53 | ||
54 | There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 0 at the point of entry to the function (the code below relies on a zeroed dictionary to detect single value pages). | |
55 | For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits). | |
56 | a. k = (x>>10)&255; // 8-bit hash table index | |
57 | b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed | |
58 | c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated | |
59 | ||
60 | Each input word x is classified/tagged into 4 classes : | |
61 | 0 : x = 0 | |
62 | 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word | |
63 | 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss | |
64 | 3 : (x == dict_word), the exact input word is in the dictionary | |
65 | ||
66 | For each class, different numbers of bits are needed for the decompressor to reproduce the original input word. | |
67 | 0 : 2-bits tag (32->2 compression) | |
68 | 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression) | |
69 | 2 : 2-bits tag + 32-bits new word (32->34 expansion) | |
70 | 3 : 2-bits tag + 4-bits dict_index (32->6 compression) | |
71 | ||
72 | It is obvious now that the WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are frequent repeats of some word patterns (32->6). | |
73 | ||
74 | the output bit stream (*dest_buf) consists of | |
75 | a. 12 bytes header | |
76 | b. 256 bytes for 1024 packed tags | |
77 | c. (varying number of) words for new words not matched to dictionary word. | |
78 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) | |
79 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) | |
80 | ||
81 | the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively. | |
82 | Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word. | |
83 | ||
84 | ||
85 | The WKdm compress algorithm first runs a scan and classification pass, tagging each word and writing unpacked data into temporary buffers. It then packs that data into the output buffer. | |
86 | ||
87 | The temp buffers are | |
88 | ||
89 | uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing | |
90 | uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing | |
91 | uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing | |
92 | ||
93 | Since the new words (that cannot be matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to | |
94 | the destination buffer. | |
95 | ||
96 | uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags. | |
97 | ||
98 | Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass. | |
99 | ||
100 | byte_count = bytes_budget - 12 - 256; // header + tags | |
101 | ||
102 | whenever an input word is classified as class | |
103 | ||
104 | 2 : byte_count -= 4; | |
105 | ||
106 | in 4-bit/10-bit packing, we can also return -1 when byte_budget <=0; | |
107 | ||
108 | Note : since there might be extra padding bits for class 1 and 3, it is complicated to track this padding bits on the fly. To compromise, we change class 1 to | |
109 | ||
110 | without showing the bit budget management, the pseudo code is given as follows: | |
111 | ||
112 | uint8_t *tags=tempTagsArray; | |
113 | uint8_t *dict=tempQPosArray; | |
114 | uint16_t *partial=tempLowBitsArray; | |
115 | ||
116 | for (i=0;i<1024;i++) { | |
117 | x = *src_buf++; | |
118 | if (x == 0) { // zero, 2-bits tag | |
119 | *tags++ = 0; | |
120 | } else { | |
121 | ||
122 | // find dict_index and dict_word from x | |
123 | k = (x>>10)&255; | |
124 | dict_index = hashTable[k]; | |
125 | dict_word = dictionary[dict_index]; | |
126 | ||
127 | if (dict_word == x) { // exactly match | |
128 | // 2-bits tag + 4-bits table index | |
129 | *tags++ = 3; | |
130 | *dict++ = dict_index; | |
131 | } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched | |
132 | // 2-bits tag + 4-bits table index + 10-bits lower partial | |
133 | *tags++ = 1; | |
134 | *dict++ = dict_index; | |
135 | *partial++ = x &0x3ff; | |
136 | dictionary[dict_index] = x; | |
137 | } else { // not matched | |
138 | // 2-bits tag + 32-bits new word | |
139 | *tags++ = 2; | |
140 | *new_word++ = x; | |
141 | dictionary[dict_index] = x; | |
142 | } | |
143 | } | |
144 | } | |
145 | ||
146 | after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf: | |
147 | ||
148 | 1. 1024 tags are packed into 256 bytes right after the 12-bytes header | |
149 | 2. dictionary indices (4-bits each) are packed into the area right after the new words section | |
150 | 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section. | |
151 | ||
152 | cclee, 11/9/12 | |
153 | ||
154 | Added zero page, single value page, sparse page, early abort optimizations | |
155 | rsrini, 09/14/14 | |
156 | */ | |
157 | .text | |
158 | .align 4 | |
159 | ||
160 | // int WKdm_compress_new (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); | |
161 | ||
162 | .globl _WKdm_compress_new | |
163 | _WKdm_compress_new: | |
164 | ||
165 | /* | |
166 | ------------------------- symbolizing register use (note: several names deliberately share one register) ----------------------------------- | |
167 | */ | |
168 | ||
169 | #define src_buf r0 // 1st argument: input page | |
170 | #define next_input_word r0 // same register as src_buf, post-incremented by the scan loop | |
171 | #define dest_buf r1 // 2nd argument: output buffer | |
172 | #define scratch r2 // 3rd argument: base of the temporary arrays | |
173 | #define dictionary sp // the 16-word dictionary sits at the bottom of the stack frame | |
174 | #define byte_count r3 // 4th argument (bytes_budget) on entry, then the remaining byte budget | |
175 | ||
176 | #define next_tag r12 // write pointer into tempTagsArray (shares r12 with ecx) | |
177 | ||
178 | #define remaining r4 // input words left before the next CHECKPOINT | |
179 | #define next_full_patt r5 // write pointer for new (missed) words inside dest_buf | |
180 | #define dict_location r6 // address of the dictionary slot selected for the current word (shares r6 with rdi) | |
181 | #define next_qp r8 // write pointer into tempQPosArray | |
182 | #define hashTable r9 // base address of the hash lookup table | |
183 | #define next_low_bits r10 // write pointer into tempLowBitsArray | |
184 | #define eax r11 // general temporary | |
185 | #define ecx r12 // general temporary (shares r12 with next_tag) | |
186 | #define edx lr // general temporary; holds the current input word in the scan loop | |
187 | #define rdi r6 // output write pointer in the packing phase (shares r6 with dict_location) | |
188 | ||
189 | #define tempTagsArray scratch | |
190 | #define R11 r0 // only safe to use between phase-1 and phase-2 (overwrites next_input_word) | |
191 | #define R13 r4 // only safe to use between phase-1 and phase-2 (overwrites remaining) | |
192 | /* | |
193 | ------------------------- allocate scratch memory for local use -------------------------------------- | |
194 | ||
195 | need 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*2 (tempLowBitsArray) | |
196 | total 4096 bytes | |
197 | [scratch,#0] : tempTagsArray | |
198 | [scratch,#1024] : tempQPosArray | |
199 | [scratch,#2048] : tempLowBitsArray | |
200 | ||
201 | [sp,#0] : dictionary | |
202 | ||
203 | */ | |
204 | ||
205 | #define TagsArray_offset 0 // byte offset of tempTagsArray within scratch | |
206 | #define QPosArray_offset 1024 // byte offset of tempQPosArray within scratch | |
207 | #define LowBitsArray_offset 2048 // byte offset of tempLowBitsArray within scratch | |
208 | ||
209 | #define SV_RETURN 0 // return value when SV, ZV page is found | |
210 | #define MZV_MAGIC 17185 // magic value (0x4321) used to identify MZV page encoding; written as the first word of a sparse-packed stream | |
211 | #define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096] | |
212 | #define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words | |
213 | #define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data (2 bits of tag per 4-byte word) | |
214 | #define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing .. | |
215 | // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096 | |
216 | #if CHKPT_BYTES > 4096 | |
217 | #error CHKPT_BYTES must be <= 4096 | |
218 | #endif | |
219 | #if CHKPT_BYTES < 4 | |
220 | #error CHKPT_BYTES must be >= 4 | |
221 | #endif | |
222 | ||
223 | push {r7,lr} // save caller's frame pointer and the return address | |
224 | mov r7, sp // establish the frame pointer | |
225 | push {r4-r6,r8-r11} // save the other registers this routine overwrites | |
226 | ||
227 | #if KERNEL | |
228 | sub sp, sp, #32 | |
229 | vst1.64 {q0,q1}, [sp] // save q0/q1, which are used below; restored at L_done | |
230 | #endif | |
231 | ||
232 | sub sp, sp, #(64+24) // reserve stack space for temps + dictionary (dictionary at [sp,#0..63], temps from [sp,#64]) | |
233 | ||
234 | /* | |
235 | ----- set up registers and initialize WKdm dictionary ---------- | |
236 | */ | |
237 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO | |
238 | // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES | |
239 | mov eax, #0 | |
240 | ||
241 | mov next_tag, scratch // &tempTagsArray[0] | |
242 | vdup.32 q0, eax // q0 = 16 bytes of zero | |
243 | ||
244 | add next_qp, scratch, #QPosArray_offset // &tempQPosArray[0] | |
245 | mov lr, sp // lr walks the dictionary while it is being zeroed | |
246 | mov remaining, #(CHKPT_WORDS) // remaining input words .. initially set to checkpoint | |
247 | vst1.64 {q0}, [lr]! // zero dictionary bytes 0..15 | |
248 | add next_full_patt, dest_buf, #268 // dest_buf + 12 (header) + 256 (packed tags): where the first new word goes | |
249 | vst1.64 {q0}, [lr]! // zero dictionary bytes 16..31 | |
250 | vst1.64 {q0}, [lr]! // zero dictionary bytes 32..47 | |
251 | add next_low_bits, scratch, #LowBitsArray_offset // &tempLowBitsArray[0] | |
252 | vst1.64 {q0}, [lr]! // zero dictionary bytes 48..63 | |
253 | ||
254 | #if defined(KERNEL) && !SLIDABLE | |
255 | adr hashTable, L_table | |
256 | ldr hashTable, [hashTable] | |
257 | #else | |
258 | ldr hashTable, L_table | |
259 | L_table0: | |
260 | ldr hashTable, [pc, hashTable] | |
261 | #endif | |
262 | ||
263 | #define EARLYCHECK 0 | |
264 | #define NORMAL 1 | |
265 | ||
266 | #define mode [sp, #64] | |
267 | #define start_next_full_patt [sp, #68] | |
268 | #define start_next_input_word [sp, #72] | |
269 | #define start_next_low_bits [sp, #76] | |
270 | #define byte_budget [sp, #80] | |
271 | ||
272 | mov edx, #EARLYCHECK | |
273 | str edx, mode // indicate we are yet to evaluate the early aborts | |
274 | str next_full_patt, start_next_full_patt // remember the start of next_full_patt | |
275 | str next_input_word, start_next_input_word // remember the start of next_input_word | |
276 | str next_low_bits, start_next_low_bits // remember the start of next_low_bits | |
277 | str byte_count, byte_budget // remember the byte budget | |
278 | ||
279 | sub byte_count, byte_count, #(12+256) // byte_count - header bytes - tags bytes | |
280 | b L_scan_loop | |
281 | ||
282 | .align 4, 0x90 | |
283 | L_RECORD_ZERO: | |
284 | /* we've just detected a zero input word in edx */ | |
285 | strb edx, [next_tag], #1 // *next_tag++ = ZERO; | |
286 | subs remaining, remaining, #1 // remaining input words | |
287 | ble CHECKPOINT // if remaining = 0, break | |
288 | ||
289 | /* WKdm compress scan/tag loop */ | |
290 | L_scan_loop: | |
291 | ldr edx, [next_input_word], #4 // input_word = *next_input_word++ | |
292 | cmp edx, #0 | |
293 | beq L_RECORD_ZERO // if (input_word==0) RECORD_ZERO | |
294 | ||
295 | /* | |
296 | now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location | |
297 | */ | |
298 | and eax, edx, #(0xff<<10) // part of input_word for hash table index | |
299 | lsr eax, eax, #10 // 8-bit index to the Hash Table | |
300 | ldrb eax, [hashTable, eax] // HASH_TO_DICT_BYTE_OFFSET(input_word) | |
301 | add dict_location, dictionary, eax // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word)); | |
302 | ldr eax, [dictionary, eax] // dict_word = *dict_location; | |
303 | cmp eax, edx // dict_word vs input_word | |
304 | beq L_RECORD_EXACT // if identical, RECORD_EXACT | |
305 | ||
306 | eor eax, eax, edx // dict_word ^ input_word | |
307 | lsrs eax, eax, #10 // (dict_word ^ input_word) >> 10, zero iff the upper 22 bits match | |
308 | beq L_RECORD_PARTIAL // if the upper 22 bits are identical, RECORD_PARTIAL | |
309 | ||
310 | L_RECORD_MISS: | |
311 | /* | |
312 | if we are here, the input word can not be derived from the dictionary, | |
313 | we write the input word as a new word, | |
314 | and update the dictionary with this new word | |
315 | */ | |
316 | ||
317 | subs byte_count, byte_count, #4 // a new word costs 4 bytes of the budget | |
318 | ble L_budgetExhausted // if byte_count <= 0, return -1 to signal this page is not compressable | |
319 | str edx, [next_full_patt], #4 // *next_full_patt++ = input_word; | |
320 | mov eax, #2 | |
321 | str edx, [dict_location] // *dict_location = input_word | |
322 | strb eax, [next_tag], #1 // *next_tag++ = 2 for miss | |
323 | subs remaining, remaining, #1 // remaining input words | |
324 | bgt L_scan_loop // if remaining>0, go on the scan/tag pass, | |
325 | b CHECKPOINT // if remaining = 0, break | |
326 | ||
327 | L_done_search: | |
328 | ||
329 | // SET_QPOS_AREA_START(dest_buf,next_full_patt); | |
330 | sub eax, next_full_patt, dest_buf // next_full_patt - dest_buf | |
331 | lsr eax, eax, #2 // offset in 4-bytes | |
332 | str eax, [dest_buf] // dest_buf[0] = (next_full_patt - dest_buf) in words, i.e. end of the new words section | |
333 | ||
334 | ||
335 | /* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/ | |
336 | // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); | |
337 | ||
338 | add rdi, dest_buf, #12 // output pointer = dest_buf + 12-byte header | |
339 | mov eax, scratch // &tempTagsArray[0] | |
340 | sub edx, next_tag, scratch // number of one-byte tags recorded, this should be 1024 | |
341 | ||
342 | vld1.64 {q0,q1}, [eax,:128]! // load the first 32 tags (requires 16-byte aligned scratch) | |
343 | subs edx, edx, #32 // pre-decrement by 32 | |
344 | L_pack_2bits: | |
345 | subs edx, edx, #32 // each iteration consumes 32 one-byte tags and stores 8 packed bytes | |
346 | vshl.i64 d1, d1, #4 | |
347 | vshl.i64 d3, d3, #4 | |
348 | vorr d0, d0, d1 | |
349 | vorr d2, d2, d3 | |
350 | vshr.u64 d1, d0, #30 | |
351 | vshr.u64 d3, d2, #30 | |
352 | vorr d0, d0, d1 | |
353 | vorr d2, d2, d3 | |
354 | vzip.32 d0, d2 | |
355 | vst1.64 {d0}, [rdi]! | |
356 | vld1.64 {q0,q1}, [eax,:128]! // load the next 32 tags for the following iteration / the tail below | |
357 | bgt L_pack_2bits // flags are still those of the subs at the loop head | |
358 | vshl.i64 d1, d1, #4 // tail: pack the final 32 tags without loading any further | |
359 | vshl.i64 d3, d3, #4 | |
360 | vorr d0, d0, d1 | |
361 | vorr d2, d2, d3 | |
362 | vshr.u64 d1, d0, #30 | |
363 | vshr.u64 d3, d2, #30 | |
364 | vorr d0, d0, d1 | |
365 | vorr d2, d2, d3 | |
366 | vzip.32 d0, d2 | |
367 | vst1.64 {d0}, [rdi] | |
368 | ||
369 | ||
370 | /* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */ | |
371 | ||
372 | /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */ | |
373 | add ecx, scratch, #QPosArray_offset // tempQPosArray | |
374 | sub eax, next_qp, ecx // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray; | |
375 | add eax, eax, #7 // num_bytes_to_pack+7 | |
376 | lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3 | |
377 | subs byte_count, byte_count, eax, lsl #2 // byte_count -= 4 * packed_words | |
378 | blt L_budgetExhausted // if byte_count < 0, return -1 to signal this page is not compressable | |
379 | add ecx, ecx, eax, lsl #3 // endQPosArray = tempQPosArray + 8*num_packed_words | |
380 | cmp ecx, next_qp // endQPosArray vs next_qp | |
381 | bls L16 // if (next_qp >= endQPosArray) skip the following zero paddings | |
382 | sub eax, ecx, next_qp // eax = number of zero padding bytes to append | |
383 | mov edx, #0 | |
384 | tst eax, #4 | |
385 | beq 1f | |
386 | str edx, [next_qp], #4 // append 4 zero bytes | |
387 | 1: tst eax, #2 | |
388 | beq 1f | |
389 | strh edx, [next_qp], #2 // append 2 zero bytes | |
390 | 1: tst eax, #1 | |
391 | beq 1f | |
392 | strb edx, [next_qp], #1 // append 1 zero byte | |
393 | 1: | |
394 | L16: | |
395 | add edx, scratch, #QPosArray_offset // tempQPosArray | |
396 | mov rdi, next_full_patt // output pointer starts right after the new words | |
397 | cmp ecx, edx // endQPosArray vs tempQPosArray | |
398 | ldr eax, [dest_buf] // eax = dest_buf[0]; this is what L20 stores into dest_buf[1] when there is nothing to pack | |
399 | bls L20 // if (endQPosArray <= tempQPosArray) skip the following | |
400 | ||
401 | /* packing 4-bits dict indices into dest_buf */ | |
402 | L_pack_4bits: | |
403 | vld1.64 {d0}, [edx,:64]! // src_next[1]:src_next[0] | |
404 | vshr.u64 d1, d0, #28 // (src_next[1] << 4) | |
405 | vorr d0, d0, d1 // src_next[0] | (src_next[1] << 4) | |
406 | cmp ecx, edx // source_end vs src_next | |
407 | vstr s0, [rdi] // store the low 32 bits of d0: 8 one-byte indices in, one 32-bit word out | |
408 | add rdi, rdi, #4 | |
409 | bhi L_pack_4bits // while (src_next < source_end) repeat the loop | |
410 | ||
411 | /* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */ | |
412 | // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); | |
413 | sub eax, rdi, dest_buf // boundary_tmp - dest_buf | |
414 | lsr eax, eax, #2 // boundary_tmp - dest_buf in words | |
415 | L20: | |
416 | str eax, [dest_buf,#4] // dest_buf[1] = (boundary_tmp - dest_buf) in words, i.e. end of the dict indices section | |
417 | ||
418 | add ecx, scratch, #LowBitsArray_offset // tempLowBitsArray | |
419 | sub edx, next_low_bits, ecx // next_low_bits - tempLowBitsArray (in bytes) | |
420 | lsr edx, edx, #1 // num_tenbits_to_pack (in half-words) | |
421 | subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3 | |
422 | blt 1f // if num_tenbits_to_pack < 3, skip the following loop | |
423 | 0: | |
424 | subs byte_count, byte_count, #4 // byte_count -= 4 | |
425 | ble L_budgetExhausted // if byte_count <= 0, return -1 to signal this page is not compressable | |
426 | ldr r4,[ecx, #2] // w2:6bits:w1 (word load from a halfword-aligned address) | |
427 | ldrh r0,[ecx], #6 // w0, then advance past the 3 half-words | |
428 | uxth r5, r4, ror #16 // w2 | |
429 | uxth r4, r4 // w1 | |
430 | orr r0, r0, r4, lsl #10 // w1:w0 | |
431 | subs edx, edx, #3 // num_tenbits_to_pack-=3 | |
432 | orr r0, r0, r5, lsl #20 // w2:w1:w0 | |
433 | str r0, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word | |
434 | bge 0b // if no less than 3 elements, back to loop head | |
435 | ||
436 | 1: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3 | |
437 | beq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following | |
438 | subs byte_count, byte_count, #4 // byte_count -= 4 | |
439 | ble L_budgetExhausted // if byte_count <= 0, return -1 to signal this page is not compressable | |
440 | ldrh eax,[ecx] // w0 | |
441 | subs edx, edx, #1 // num_tenbits_to_pack-- | |
442 | beq 2f // only 1 element left: write w0 alone | |
443 | ldrh edx, [ecx, #2] // w1 | |
444 | orr eax, eax, edx, lsl #10 // w0 | (w1<<10) | |
445 | ||
446 | 2: str eax, [rdi], #4 // write the final dest_buf word | |
447 | ||
448 | 3: sub eax, rdi, dest_buf // boundary_tmp - dest_buf | |
449 | lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words | |
450 | str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) | |
451 | lsl r0, eax, #2 // return value: boundary_tmp - dest_buf in terms of bytes | |
452 | ||
453 | L_done: | |
454 | // restore registers and return (r0 already holds the return value) | |
455 | ||
456 | add sp, sp, #(64+24) // skip memory for temps + dictionary | |
457 | #if KERNEL | |
458 | vld1.64 {q0,q1}, [sp]! // restore q0/q1 saved in the prologue and release their 32 bytes | |
459 | #endif | |
460 | pop {r4-r6,r8-r11} | |
461 | pop {r7,pc} // restore the frame pointer and return | |
462 | ||
463 | .align 4 | |
464 | L_budgetExhausted: | |
465 | mov r0, #-1 // return -1: the page does not fit in the budget (or was aborted early) | |
466 | b L_done | |
467 | ||
468 | ||
469 | .align 4,0x90 | |
470 | L_RECORD_EXACT: | |
471 | /* | |
472 | we have an exact match of the input word to its corresponding dictionary word | |
473 | write tag/dict_index to the temporary buffers | |
474 | */ | |
475 | sub edx, dict_location, dictionary // dict_location - dictionary | |
476 | mov eax, #3 | |
477 | lsr edx, edx, #2 // divide by 4 for word offset | |
478 | strb eax, [next_tag], #1 // *next_tag++ = 3 for exact | |
479 | strb edx, [next_qp], #1 // *next_qp++ = word offset (4-bit) | |
480 | subs remaining, remaining, #1 // remaining input words | |
481 | bgt L_scan_loop // if remaining>0, go on the scan/tag pass, | |
482 | b CHECKPOINT // if remaining = 0, break | |
483 | ||
484 | .align 4,0x90 | |
485 | L_RECORD_PARTIAL: | |
486 | /* | |
487 | we have a partial (high 22-bits) match of the input word to its corresponding dictionary word | |
488 | write tag/dict_index/low 10 bits to the temporary buffers | |
489 | */ | |
490 | sub eax, dict_location, dictionary // dict_location - dictionary | |
491 | str edx, [dict_location] // *dict_location = input_word; | |
492 | lsr eax, eax, #2 // offset in 32-bit word | |
493 | lsl edx, edx, #22 // shift out the upper 22 bits ... | |
494 | strb eax, [next_qp], #1 // update *next_qp++ | |
495 | mov eax, #1 | |
496 | lsr edx, edx, #22 // ... leaving the lower 10 bits | |
497 | strb eax, [next_tag], #1 // *next_tag++ = 1 for partial matched | |
498 | strh edx, [next_low_bits], #2 // *next_low_bits++ = lower 10 bits | |
499 | subs remaining, remaining, #1 // remaining input words | |
500 | bgt L_scan_loop // if remaining>0, go on the scan/tag pass, otherwise fall through to CHECKPOINT | |
501 | ||
502 | CHECKPOINT: | |
503 | ldr eax, mode // load the mode | |
504 | cmp eax, #EARLYCHECK // still in the first CHKPT_WORDS words? | |
505 | beq L_check_compression_ratio // early abort check | |
506 | ||
507 | L_check_zero_page: | |
508 | ||
509 | ldr eax, start_next_full_patt // check if any dictionary misses in page | |
510 | cmp eax, next_full_patt | |
511 | bne L_check_single_value_page // at least one miss: not a zero page | |
512 | ||
513 | add eax, scratch, #QPosArray_offset // get start_next_qp | |
514 | cmp eax, next_qp // check if any partial or exact dictionary matches | |
515 | ||
516 | moveq r0, #SV_RETURN // no misses and no hits, every word was zero: Magic return value | |
517 | beq L_done // otherwise fall through to L_check_single_value_page | |
518 | ||
519 | L_check_single_value_page: | |
520 | ||
521 | ldr eax, start_next_full_patt // get # dictionary misses | |
522 | sub eax, next_full_patt, eax // bytes of new words written so far | |
523 | lsr eax, eax, #2 // eax = # dictionary misses (4 bytes each) | |
524 | ||
525 | add R11, scratch, #QPosArray_offset // get start_next_qp | |
526 | sub R11, next_qp, R11 // get # dictionary hits (exact + partial) | |
527 | ||
528 | ldr R13, start_next_low_bits | |
529 | sub R13, next_low_bits, R13 // get # dictionary partial hits | |
530 | lsrs R13, R13, #1 // 2 bytes per entry; sets Z when there were no partial hits | |
531 | ||
532 | // Single value page if one of the following is true: | |
533 | // partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss) | |
534 | // partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial) | |
535 | // | |
536 | bne 1f // were there 0 partial hits? | |
537 | ||
538 | mov edx, #1023 | |
539 | cmp R11, edx // were there 1023 dictionary hits | |
540 | bne 1f | |
541 | ||
542 | cmp eax, #1 // was there exactly 1 dictionary miss? | |
543 | bne 1f | |
544 | ||
545 | ldrb edx, [tempTagsArray] // read the very 1st tag | |
546 | cmp edx, #2 // was the very 1st tag a miss? | |
547 | beq L_is_single_value_page | |
548 | ||
549 | 1: | |
550 | cmp R13, #1 // was there 1 partial hit? | |
551 | bne L_check_mostly_zero | |
552 | ||
553 | mov edx, #1024 | |
554 | cmp R11, edx // were there 1024 dictionary hits | |
555 | bne L_check_mostly_zero | |
556 | ||
557 | ldrb edx, [tempTagsArray] // read the very 1st tag | |
558 | cmp edx, #1 // was the very 1st tag a partial? | |
559 | bne L_check_mostly_zero // no: not a single value page (eax = # misses and R11 = # hits stay live for L_check_mostly_zero) | |
560 | ||
561 | L_is_single_value_page: | |
562 | ||
563 | mov r0, #SV_RETURN // Magic return value; only reached once one of the two conditions above holds | |
564 | b L_done | |
565 | ||
566 | L_check_mostly_zero: | |
567 | // how much space will the sparse packer take? | |
568 | add eax, eax, R11 // eax = # misses + # hits, i.e. the number of non-zero words | |
569 | mov edx, #6 | |
570 | mov R11, #4 | |
571 | mla R11, eax, edx, R11 // R11 = eax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header | |
572 | ||
573 | ldr eax, start_next_low_bits | |
574 | sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits | |
575 | mov edx, #1365 // 1365/2048 is approximately 2/3 (the divide is the lsr #11 below) | |
576 | mul eax, eax, edx | |
577 | ||
578 | ldr edx, start_next_full_patt | |
579 | sub edx, next_full_patt, edx // get bytes consumed by dictionary misses | |
580 | add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) | |
581 | ||
582 | add edx, scratch, #QPosArray_offset // get start_next_qp | |
583 | sub edx, next_qp, edx | |
584 | add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2 | |
585 | mov edx, #(12+256) | |
586 | add eax, eax, edx // eax += bytes taken by the header + tags | |
587 | ||
588 | cmp eax, R11 // is the default packer the better option? | |
589 | blt L_done_search // yes: estimated default size < sparse size, go pack with the default packer | |
590 | ||
591 | ldr edx, byte_budget | |
592 | cmp R11, edx // can the sparse packer fit into the given budget? | |
593 | bgt L_budgetExhausted | |
594 | ||
595 | L_sparse_packer: | |
596 | ||
597 | mov edx, #MZV_MAGIC | |
598 | str edx, [dest_buf], #4 // header to indicate a sparse packer | |
599 | ||
600 | ldr R13, start_next_input_word // get the starting address of src | |
601 | mov edx, #0 // edx = byte offset of the current word within the page | |
602 | mov ecx, #4096 // ecx = input bytes left to scan | |
603 | ||
604 | 1: | |
605 | ldm R13!, {r2, r3, r5, r6, r7, r8, r9, r10} // load 8 input words; overwrites scratch, byte_count and r7, none of which are read again (r7 is reloaded by the pop at L_done) | |
606 | ||
607 | teq r2, #0 // Z stays set only if all 8 words are zero | |
608 | teqeq r3, #0 | |
609 | teqeq r5, #0 | |
610 | teqeq r6, #0 | |
611 | teqeq r7, #0 | |
612 | teqeq r8, #0 | |
613 | teqeq r9, #0 | |
614 | teqeq r10, #0 | |
615 | ||
616 | bne 2f // at least one non-zero word: emit them individually | |
617 | subs ecx, ecx, #32 | |
618 | add edx, edx, #32 // 32 more bytes have been processed (add leaves the flags of the subs intact) | |
619 | bne 1b | |
620 | mov r0, R11 // store the size of the compressed stream | |
621 | b L_done | |
622 | ||
623 | 2: | |
624 | teq r2, #0 | |
625 | strne r2, [dest_buf], #4 // store the non-0 word in the dest buffer | |
626 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
627 | add edx, edx, 4 | |
628 | ||
629 | teq r3, #0 | |
630 | strne r3, [dest_buf], #4 // store the non-0 word in the dest buffer | |
631 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
632 | add edx, edx, 4 | |
633 | ||
634 | teq r5, #0 | |
635 | strne r5, [dest_buf], #4 // store the non-0 word in the dest buffer | |
636 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
637 | add edx, edx, 4 | |
638 | ||
639 | teq r6, #0 | |
640 | strne r6, [dest_buf], #4 // store the non-0 word in the dest buffer | |
641 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
642 | add edx, edx, 4 | |
643 | ||
644 | teq r7, #0 | |
645 | strne r7, [dest_buf], #4 // store the non-0 word in the dest buffer | |
646 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
647 | add edx, edx, 4 | |
648 | ||
649 | teq r8, #0 | |
650 | strne r8, [dest_buf], #4 // store the non-0 word in the dest buffer | |
651 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
652 | add edx, edx, 4 | |
653 | ||
654 | teq r9, #0 | |
655 | strne r9, [dest_buf], #4 // store the non-0 word in the dest buffer | |
656 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
657 | add edx, edx, 4 | |
658 | ||
659 | teq r10, #0 | |
660 | strne r10, [dest_buf], #4 // store the non-0 word in the dest buffer | |
661 | strhne edx, [dest_buf], #2 // store the 16-bit byte offset of that word | |
662 | add edx, edx, 4 | |
663 | ||
664 | subs ecx, ecx, #32 // 32 more input bytes consumed | |
665 | bne 1b | |
666 | mov r0, R11 // store the size of the compressed stream | |
667 | b L_done | |
668 | ||
669 | L_check_compression_ratio: | |
670 | ||
671 | mov eax, #NORMAL | |
672 | str eax, mode // the early check runs only once: the next CHECKPOINT takes the normal path | |
673 | mov remaining, #(1024 - CHKPT_WORDS) // remaining input words to process | |
674 | cmp remaining, #0 | |
675 | beq CHECKPOINT // if there are no remaining words to process | |
676 | ||
677 | ldr eax, start_next_low_bits | |
678 | sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits | |
679 | mov edx, #1365 // 1365/2048 is approximately 2/3 (the divide is the lsr #11 below) | |
680 | mul eax, eax, edx | |
681 | ||
682 | ldr edx, start_next_full_patt | |
683 | sub edx, next_full_patt, edx // get bytes consumed by dictionary misses | |
684 | add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) | |
685 | ||
686 | add edx, scratch, #QPosArray_offset // get start_next_qp | |
687 | sub edx, next_qp, edx | |
688 | add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2 | |
689 | mov edx, #(CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES) | |
690 | subs eax, eax, edx // equivalent to: eax += CHKPT_TAG_BYTES; eax -= CHKPT_SHRUNK_BYTES | |
691 | bgt L_budgetExhausted // if eax is > 0, we need to early abort | |
692 | b L_scan_loop // otherwise resume scanning the rest of the page | |
693 | ||
694 | ||
695 | #if defined(KERNEL) && !SLIDABLE | |
696 | .align 2 | |
697 | L_table: | |
698 | .long _hashLookupTable_new // address of the hash table, loaded through adr/ldr after the dictionary init | |
699 | #else | |
700 | .align 2 | |
701 | L_table: | |
702 | .long L_Tab$non_lazy_ptr-(L_table0+8) // offset added to pc by the ldr at L_table0 (pc reads as L_table0+8 there) to reach the non-lazy pointer | |
703 | ||
704 | .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers | |
705 | .align 2 | |
706 | L_Tab$non_lazy_ptr: | |
707 | .indirect_symbol _hashLookupTable_new | |
708 | .long 0 // pointer slot for _hashLookupTable_new; the code loads hashTable from this slot | |
709 | #endif | |
710 |