]> git.saurik.com Git - apple/xnu.git/blame - osfmk/arm/WKdmCompress_new.s
xnu-4570.71.2.tar.gz
[apple/xnu.git] / osfmk / arm / WKdmCompress_new.s
CommitLineData
5ba3f43e
A
1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains armv7 hand optimized implementation of WKdm memory page compressor.
31
32 int WKdm_compress_new (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
33
34 input :
35 src_buf : address of input page (length = 1024 words)
36 dest_buf : address of output buffer (may not be 16-byte aligned)
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
38 bytes_budget : a given byte target in compression
39
40 output :
41
42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
47
48 The 4th argument bytes_budget is the target compress budget in bytes.
49 If the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
51
52 WKdm Compression algorithm is briefly stated as follows:
53
54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 0 at the point of entry to the function (this implementation zero-fills the dictionary so that single value pages can be detected).
55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
56 a. k = (x>>10)&255; // 8-bit hash table index
57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed
58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated
59
60 Each input word x is classified/tagged into 4 classes :
61 0 : x = 0
62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
64 3 : (x == dict_word), the exact input word is in the dictionary
65
66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
67 0 : 2-bits tag (32->2 compression)
68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
69 2 : 2-bits tag + 32-bits new word (32->34 expansion)
70 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
71
72 It is obvious now that the WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are frequent repeats of some word patterns (32->6).
73
74 the output bit stream (*dest_buf) consists of
75 a. 12 bytes header
76 b. 256 bytes for 1024 packed tags
77 c. (varying number of) words for new words not matched to dictionary word.
78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
80
81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
83
84
85 The WKdm compress algorithm first runs a scan and classification pass, tagging the input and writing unpacked data into temporary buffers. It then packs that data into the output buffer.
86
87 The temp buffers are
88
89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing
90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing
91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing
92
93 Since the new words (those that cannot be matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
94 the destination buffer.
95
96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags.
97
98 Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
99
100 byte_count = bytes_budget - 12 - 256; // header + tags
101
102 whenever an input word is classified as class
103
104 2 : byte_count -= 4;
105
106 in 4-bit/10-bit packing, we can also return -1 when byte_budget <=0;
107
108 Note : since there might be extra padding bits for class 1 and 3, it is complicated to track this padding bits on the fly. To compromise, we change class 1 to
109
110 without showing the bit budget management, the pseudo code is given as follows:
111
112 uint8_t *tags=tempTagsArray;
113 uint8_t *dict=tempQPosArray;
114 uint8_t *partial=tempLowBitsArray;
115
116 for (i=0;i<1024;i++) {
117 x = *src_buf++;
118 if (x == 0) { // zero, 2-bits tag
119 *tags++ = 0;
120 } else {
121
122 // find dict_index and dict_word from x
123 k = (x>>10)&255;
124 dict_index = hashTable[k];
125 dict_word = dictionary[dict_index];
126
127 if (dict_word == x) { // exactly match
128 // 2-bits tag + 4-bits table index
129 *tags++ = 3;
130 *dict++ = dict_index;
131 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched
132 // 2-bits tag + 4-bits table index + 10-bits lower partial
133 *tags++ = 1;
134 *dict++ = dict_index;
135 *partial++ = x &0x3ff;
136 dictionary[dict_index] = x;
137 } else { // not matched
138 // 2-bits tag + 32-bits new word
139 *tags++ = 2;
140 *new_word++ = x;
141 dictionary[dict_index] = x;
142 }
143 }
144 }
145
146 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
147
148 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
149 2. dictionary indices (4-bits each) are packed right after the new words section
150 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
151
152 cclee, 11/9/12
153
154 Added zero page, single value page, sparse page, early abort optimizations
155 rsrini, 09/14/14
156*/
157 .text
158 .align 4
159
160 // int WKdm_compress_new (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
// The four arguments are read from r0 (src_buf), r1 (dest_buf), r2 (scratch) and r3 (bytes_budget, kept in byte_count),
// as given by the register names defined below.
161
162.globl _WKdm_compress_new
163_WKdm_compress_new:
164
165/*
166 ------------------------- symbolizing register use -----------------------------------
   Several symbolic names share one physical register, so only one name of each group can be live at a time:
     src_buf / next_input_word / R11  -> r0
     remaining / R13                  -> r4
     dict_location / rdi              -> r6
     next_tag / ecx                   -> r12
     edx                              -> lr
   dictionary is the stack pointer itself: the 16-word dictionary lives at [sp,#0].
167*/
168
169 #define src_buf r0
170 #define next_input_word r0
171 #define dest_buf r1
172 #define scratch r2
173 #define dictionary sp
174 #define byte_count r3
175
176 #define next_tag r12
177
178 #define remaining r4
179 #define next_full_patt r5
180 #define dict_location r6
181 #define next_qp r8
182 #define hashTable r9
183 #define next_low_bits r10
184 #define eax r11
185 #define ecx r12
186 #define edx lr
187 #define rdi r6
188
189 #define tempTagsArray scratch
190 #define R11 r0 // alias of next_input_word: only safe to use between phase-1 (scan/tag) and phase-2 (packing)
191 #define R13 r4 // alias of remaining: only safe to use between phase-1 (scan/tag) and phase-2 (packing)
192/*
193 ------------------------- allocate scratch memory for local use --------------------------------------
194
195 need 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*2 (tempLowBitsArray)
196 total 4096 bytes
197 [scratch,#0] : tempTagsArray
198 [scratch,#1024] : tempQPosArray
199 [scratch,#2048] : tempLowBitsArray
200
201 [sp,#0] : dictionary
202
203*/
204
// Byte offsets of the three temporary arrays inside the caller-provided scratch buffer.
205 #define TagsArray_offset 0
206 #define QPosArray_offset 1024
207 #define LowBitsArray_offset 2048
208
209 #define SV_RETURN 0 // return value when SV, ZV page is found
210 #define MZV_MAGIC 17185 // magic value (0x4321) written as the first output word to identify MZV (sparse) page encoding
211 #define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
212 #define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words (104 with CHKPT_BYTES = 416)
213 #define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data (26 with CHKPT_BYTES = 416)
214 #define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing ..
215 // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
// Compile-time validation of the checkpoint position.
216#if CHKPT_BYTES > 4096
217 #error CHKPT_BYTES must be <= 4096
218#endif
219#if CHKPT_BYTES < 4
220 #error CHKPT_BYTES must be >= 4
221#endif
222
223 push {r7,lr}
224 mov r7, sp // r7 = sp right after saving r7/lr
225 push {r4-r6,r8-r11} // save the remaining registers this routine uses as working registers
226
227#if KERNEL
// Kernel builds preserve q0/q1 on the stack because the NEON code below overwrites them.
228 sub sp, sp, #32
229 vst1.64 {q0,q1}, [sp]
230#endif
231
232 sub sp, sp, #(64+24) // reserve stack space for temps + dictionary: 64-byte dictionary at [sp,#0], temps from [sp,#64]
233
234/*
235 ----- set up registers and initialize WKdm dictionary ----------
   The four vst1.64 stores below zero-fill the 64-byte dictionary through lr;
   they are interleaved with the integer pointer setup.
236*/
237 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
238 // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
239 mov eax, #0
240
241 mov next_tag, scratch // &tempTagsArray[0]
242 vdup.32 q0, eax // q0 = 4 x 0
243
244 add next_qp, scratch, #QPosArray_offset // next_qp = &tempQPosArray[0]
245 mov lr, sp // lr walks the dictionary while it is being cleared
246 mov remaining, #(CHKPT_WORDS) // remaining input words .. initially set to checkpoint
247 vst1.64 {q0}, [lr]! // dictionary[0..3] = 0
248 add next_full_patt, dest_buf, #268 // dest_buf + 12-byte header + 256 bytes of packed tags: new words are written here
249 vst1.64 {q0}, [lr]! // dictionary[4..7] = 0
250 vst1.64 {q0}, [lr]! // dictionary[8..11] = 0
251 add next_low_bits, scratch, #LowBitsArray_offset // &tempLowBitsArray[0]
252 vst1.64 {q0}, [lr]! // dictionary[12..15] = 0
253
// Load the address of _hashLookupTable_new into hashTable.
254#if defined(KERNEL) && !SLIDABLE
// L_table holds the address of the table directly.
255 adr hashTable, L_table
256 ldr hashTable, [hashTable]
257#else
// L_table holds the offset from (L_table0+8) to the non-lazy pointer; the pc-relative load at L_table0 dereferences it.
258 ldr hashTable, L_table
259L_table0:
260 ldr hashTable, [pc, hashTable]
261#endif
262
// Values stored in the mode slot: EARLYCHECK until the first checkpoint has been evaluated, NORMAL afterwards.
263#define EARLYCHECK 0
264#define NORMAL 1
265
// Stack slots located right after the 64-byte dictionary.
266#define mode [sp, #64]
267#define start_next_full_patt [sp, #68]
268#define start_next_input_word [sp, #72]
269#define start_next_low_bits [sp, #76]
270#define byte_budget [sp, #80]
271
272 mov edx, #EARLYCHECK
273 str edx, mode // indicate we are yet to evaluate the early aborts
274 str next_full_patt, start_next_full_patt // remember the start of next_full_patt
275 str next_input_word, start_next_input_word // remember the start of next_input_word
276 str next_low_bits, start_next_low_bits // remember the start of next_low_bits
277 str byte_count, byte_budget // remember the byte budget
278
279 sub byte_count, byte_count, #(12+256) // byte_count - header bytes - tags bytes
280 b L_scan_loop
281
282 .align 4, 0x90
283L_RECORD_ZERO:
284 /* we've just detected a zero input word in edx */
285 strb edx, [next_tag], #1 // *next_tag++ = ZERO; (edx is 0 here, which is the tag value for a zero word)
286 subs remaining, remaining, #1 // remaining input words
287 ble CHECKPOINT // if remaining = 0, break
// otherwise fall through into the scan loop
288
289 /* WKdm compress scan/tag loop */
290L_scan_loop:
291 ldr edx, [next_input_word], #4 // edx = *next_input_word++
292 cmp edx, #0
293 beq L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
294
295 /*
296 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
297 */
298 and eax, edx, #(0xff<<10) // part of input_word for hash table index
299 lsr eax, eax, #10 // 8-bit index to the Hash Table
300 ldrb eax, [hashTable, eax] // HASH_TO_DICT_BYTE_OFFSET(input_word): the table entry is used as a byte offset into the dictionary
301 add dict_location, dictionary, eax // ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
302 ldr eax, [dictionary, eax] // dict_word = *dict_location;
303 cmp eax, edx // dict_word vs input_word
304 beq L_RECORD_EXACT // if identical, RECORD_EXACT
305
306 eor eax, eax, edx // bits that differ between dict_word and input_word
307 lsrs eax, eax, #10 // zero iff the upper 22 bits of dict_word and input_word agree
308 beq L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
309
310L_RECORD_MISS:
311/*
312 if we are here, the input word can not be derived from the dictionary,
313 we write the input word as a new word,
314 and update the dictionary with this new word
315*/
316
317 subs byte_count, byte_count, #4 // a miss costs 4 output bytes
318 ble L_budgetExhausted // budget used up: return -1 to signal this page is not compressable
319 str edx, [next_full_patt], #4 // *next_full_patt++ = input_word;
320 mov eax, #2
321 str edx, [dict_location] // *dict_location = input_word
322 strb eax, [next_tag], #1 // *next_tag++ = 2 for miss
323 subs remaining, remaining, #1 // remaining input words
324 bgt L_scan_loop // if remaining>0, go on the scan/tag pass,
325 b CHECKPOINT // if remaining = 0, break
326
327L_done_search:
// Default packer, step 1: record where the new-words area ends and pack the byte-per-tag array into 2 bits per tag.
328
329 // SET_QPOS_AREA_START(dest_buf,next_full_patt);
330 sub eax, next_full_patt, dest_buf // next_full_patt - dest_buf
331 lsr eax, eax, #2 // offset in 4-bytes
332 str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf
333
334
335 /* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/
336 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
337
338 add rdi, dest_buf, #12 // rdi = destination of the packed tags, right after the 12-byte header
339 mov eax, scratch // &tempTagsArray[0]
340 sub edx, next_tag, scratch // this should be 1024
341
// Each pass consumes 32 tag bytes (q0,q1) and emits 8 packed bytes. The load for the next pass is issued
// at the bottom of the loop, and the last 32 bytes are packed by the copy of the loop body after the loop.
342 vld1.64 {q0,q1}, [eax,:128]! // first 32 tags
343 subs edx, edx, #32 // pre-decrement by 32
344L_pack_2bits:
345 subs edx, edx, #32 // flags from here decide the bgt below (NEON ops in between do not touch them)
346 vshl.i64 d1, d1, #4
347 vshl.i64 d3, d3, #4
348 vorr d0, d0, d1 // each byte of d0 = tag[i] | tag[i+8]<<4
349 vorr d2, d2, d3 // same for the second group of 16 tags
350 vshr.u64 d1, d0, #30 // bring the upper 4 bytes down, shifted left by 2 bits
351 vshr.u64 d3, d2, #30
352 vorr d0, d0, d1 // low 32 bits of d0 now hold 16 packed tags
353 vorr d2, d2, d3 // low 32 bits of d2 now hold the next 16 packed tags
354 vzip.32 d0, d2 // d0 = { low word of d0, low word of d2 }
355 vst1.64 {d0}, [rdi]! // write 8 bytes = 32 packed tags
356 vld1.64 {q0,q1}, [eax,:128]! // next 32 tags
357 bgt L_pack_2bits
// pack the final 32 tags that were loaded by the last loop pass
358 vshl.i64 d1, d1, #4
359 vshl.i64 d3, d3, #4
360 vorr d0, d0, d1
361 vorr d2, d2, d3
362 vshr.u64 d1, d0, #30
363 vshr.u64 d3, d2, #30
364 vorr d0, d0, d1
365 vorr d2, d2, d3
366 vzip.32 d0, d2
367 vst1.64 {d0}, [rdi]
368
369
370 /* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */
371
372 /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
373 add ecx, scratch, #QPosArray_offset // tempQPosArray
374 sub eax, next_qp, ecx // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
375 add eax, eax, #7 // num_bytes_to_pack+7
376 lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3
377 subs byte_count, byte_count, eax, lsl #2 // byte_count -= 4 * packed_words
378 blt L_budgetExhausted // budget exceeded: return -1 to signal this page is not compressable
379 add ecx, ecx, eax, lsl #3 // endQPosArray = tempQPosArray + 8*num_packed_words (8 index bytes are packed per output word)
380 cmp ecx, next_qp // endQPosArray vs next_qp
381 bls L16 // if (next_qp >= endQPosArray) skip the following zero paddings
382 sub eax, ecx, next_qp // eax = number of padding bytes needed (1..7)
383 mov edx, #0
// write the zero padding as an optional word, halfword and byte according to bits 2, 1 and 0 of eax
384 tst eax, #4
385 beq 1f
386 str edx, [next_qp], #4
3871: tst eax, #2
388 beq 1f
389 strh edx, [next_qp], #2
3901: tst eax, #1
391 beq 1f
392 strb edx, [next_qp], #1
3931:
394L16:
395 add edx, scratch, #QPosArray_offset // tempQPosArray
396 mov rdi, next_full_patt // packed indices are written right after the new words
397 cmp ecx, edx // endQPosArray vs tempQPosArray
398 ldr eax, [dest_buf] // eax = dest_buf[0]; stored into dest_buf[1] at L20 when there is nothing to pack
399 bls L20 // if (endQPosArray <= tempQPosArray) skip the following
400
401 /* packing 4-bits dict indices into dest_buf */
402L_pack_4bits:
403 vld1.64 {d0}, [edx,:64]! // src_next[1]:src_next[0]
404 vshr.u64 d1, d0, #28 // (src_next[1] << 4)
405 vorr d0, d0, d1 // src_next[0] | (src_next[1] << 4)
406 cmp ecx, edx // source_end vs src_next
407 vstr s0, [rdi] // store the low 32 bits: 8 indices packed into one word
408 add rdi, rdi, #4
409 bhi L_pack_4bits // while (src_next < source_end) repeat the loop
410
411 /* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */
412 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
413 sub eax, rdi, dest_buf // boundary_tmp - dest_buf
414 lsr eax, eax, #2 // boundary_tmp - dest_buf in words
415L20:
416 str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf
417
// Pack the 10-bit low parts, three per 32-bit output word. r0, r4 and r5 are used as plain scratch registers here,
// and rdi is the output write pointer. Each output word is charged against byte_count before it is written.
418 add ecx, scratch, #LowBitsArray_offset // tempLowBitsArray
419 sub edx, next_low_bits, ecx // next_low_bits - tempLowBitsArray (in bytes)
420 lsr edx, edx, #1 // num_tenbits_to_pack (in half-words)
421 subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3
422 blt 1f // if num_tenbits_to_pack < 3, skip the following loop
4230:
424 subs byte_count, byte_count, #4 // byte_count -= 4
425 ble L_budgetExhausted // budget used up: return -1 to signal this page is not compressable
426 ldr r4,[ecx, #2] // w2:6bits:w1
427 ldrh r0,[ecx], #6 // w0, then advance the source pointer past all three halfwords
428 uxth r5, r4, ror #16 // w2
429 uxth r4, r4 // w1
430 orr r0, r0, r4, lsl #10 // w1:w0
431 subs edx, edx, #3 // num_tenbits_to_pack-=3
432 orr r0, r0, r5, lsl #20 // w2:w1:w0
433 str r0, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word
434 bge 0b // if no less than 3 elements, back to loop head
435
4361: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3
437 beq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
// one or two 10-bit values are left over: they go into one final output word
438 subs byte_count, byte_count, #4 // byte_count -= 4
439 ble L_budgetExhausted // budget used up: return -1 to signal this page is not compressable
440 ldrh eax,[ecx] // w0
441 subs edx, edx, #1 // num_tenbits_to_pack--
442 beq 2f // only one value left
443 ldrh edx, [ecx, #2] // w1
444 orr eax, eax, edx, lsl #10 // w0 | (w1<<10)
445
4462: str eax, [rdi], #4 // write the final dest_buf word
447
4483: sub eax, rdi, dest_buf // boundary_tmp - dest_buf
449 lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words
450 str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
451 lsl r0, eax, #2 // return value: boundary_tmp - dest_buf in terms of bytes; execution continues into L_done
452
453L_done:
454 // restore registers and return
// Common exit: r0 already holds the return value. The steps below undo the prologue in reverse order.
455
456 add sp, sp, #(64+24) // skip memory for temps + dictionary
457#if KERNEL
458 vld1.64 {q0,q1}, [sp]! // restore q0/q1 saved in the prologue and release their 32 bytes
459#endif
460 pop {r4-r6,r8-r11}
461 pop {r7,pc} // restore r7 and return by loading the saved lr into pc
462
463 .align 4
464L_budgetExhausted:
465 mov r0, #-1
466 b L_done
467
468
469 .align 4,0x90
470L_RECORD_EXACT:
471/*
472 we have an exact match of the input word to its corresponding dictionary word
473 write tag/dict_index to the temporary buffers
474*/
475 sub edx, dict_location, dictionary // dict_location - dictionary (byte offset of the matching entry)
476 mov eax, #3
477 lsr edx, edx, #2 // divide by 4 for word offset
478 strb eax, [next_tag], #1 // *next_tag++ = 3 for exact
479 strb edx, [next_qp], #1 // *next_qp++ = word offset (4-bit dictionary index)
480 subs remaining, remaining, #1 // remaining input words
481 bgt L_scan_loop // if remaining>0, go on the scan/tag pass,
482 b CHECKPOINT // if remaining = 0, break
483
484 .align 4,0x90
485L_RECORD_PARTIAL:
486/*
487 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
488 write tag/dict_index/low 10 bits to the temporary buffers
489*/
490 sub eax, dict_location, dictionary // dict_location - dictionary (byte offset of the matching entry)
491 str edx, [dict_location] // *dict_location = input_word;
492 lsr eax, eax, #2 // offset in 32-bit word
493 lsl edx, edx, #22 // shift out the upper 22 bits ...
494 strb eax, [next_qp], #1 // update *next_qp++
495 mov eax, #1
496 lsr edx, edx, #22 // ... leaving only the lower 10 bits
497 strb eax, [next_tag], #1 // *next_tag++ = 1 for partial matched
498 strh edx, [next_low_bits], #2 // *next_low_bits++ = lower 10 bits
499 subs remaining, remaining, #1 // remaining input words
500 bgt L_scan_loop // if remaining>0, go on the scan/tag pass,
// if remaining = 0, execution continues into CHECKPOINT, which follows this block
501
CHECKPOINT:
	// Reached every time `remaining` runs out. The first visit (mode == EARLYCHECK)
	// evaluates the early-abort heuristic; the second visit happens once the whole
	// page has been scanned and selects how the result is reported.
	ldr	eax, mode
	cmp	eax, #EARLYCHECK
	beq	L_check_compression_ratio

L_check_zero_page:
	// An all-zero page has recorded neither a dictionary miss nor a dictionary hit.
	ldr	eax, start_next_full_patt
	cmp	eax, next_full_patt
	bne	L_check_single_value_page	// at least one miss was written

	add	eax, scratch, #QPosArray_offset	// eax = &tempQPosArray[0]
	cmp	eax, next_qp
	bne	L_check_single_value_page	// at least one exact/partial hit was recorded

	mov	r0, #SV_RETURN			// magic return value for a zero page
	b	L_done
518
L_check_single_value_page:
	// Gather the scan statistics. R11 (r0) and R13 (r4) may be used here because the
	// scan pass is over. eax and R11 are left intact for L_check_mostly_zero.
	ldr	eax, start_next_full_patt
	sub	eax, next_full_patt, eax
	lsr	eax, eax, #2				// eax = # dictionary misses

	add	R11, scratch, #QPosArray_offset		// start of tempQPosArray
	sub	R11, next_qp, R11			// R11 = # dictionary hits (exact + partial)

	ldr	R13, start_next_low_bits
	sub	R13, next_low_bits, R13
	lsrs	R13, R13, #1				// R13 = # partial hits; Z is set when there are none

	// Single value page if one of the following is true:
	//   partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
	//   partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
	//
	bne	1f					// were there 0 partial hits?

	mov	edx, #1023
	cmp	R11, edx				// were there 1023 dictionary hits?
	bne	1f

	cmp	eax, #1					// was there exactly 1 dictionary miss?
	bne	1f

	ldrb	edx, [tempTagsArray]			// read the very 1st tag
	cmp	edx, #2					// was the very 1st tag a miss?
	beq	L_is_single_value_page

1:
	cmp	R13, #1					// was there 1 partial hit?
	bne	L_check_mostly_zero

	mov	edx, #1024
	cmp	R11, edx				// were there 1024 dictionary hits?
	bne	L_check_mostly_zero

	ldrb	edx, [tempTagsArray]			// read the very 1st tag
	cmp	edx, #1					// was the very 1st tag a partial?
	bne	L_check_mostly_zero			// no: not a single value page

L_is_single_value_page:
	// Only reached when one of the two conditions above holds.
	mov	r0, #SV_RETURN				// magic return value
	b	L_done
565
566L_check_mostly_zero:
// Choose between the default packer and the sparse packer by estimating the output size of each.
// On entry eax holds the number of dictionary misses and R11 the number of dictionary hits (computed just above).
567 // how much space will the sparse packer take?
568 add eax, eax, R11 // eax = misses + hits = number of non-zero words
569 mov edx, #6
570 mov R11, #4
571 mla R11, eax, edx, R11 // R11 = eax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
572
// estimate the size of the default packer output
573 ldr eax, start_next_low_bits
574 sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits
575 mov edx, #1365
576 mul eax, eax, edx // 1365/2048 is approximately 2/3; the shift by 11 happens in the add below
577
578 ldr edx, start_next_full_patt
579 sub edx, next_full_patt, edx // get bytes consumed by dictionary misses
580 add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
581
582 add edx, scratch, #QPosArray_offset // get start_next_qp
583 sub edx, next_qp, edx
584 add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2
585 mov edx, #(12+256)
586 add eax, eax, edx // eax += bytes taken by the header + tags
587
588 cmp eax, R11 // is the default packer the better option?
589 blt L_done_search
590
591 ldr edx, byte_budget
592 cmp R11, edx // can the sparse packer fit into the given budget?
593 bgt L_budgetExhausted
// otherwise execution continues into L_sparse_packer, which follows this block
594
595L_sparse_packer:
// Sparse encoding: dest_buf receives the MZV_MAGIC word followed by one 6-byte record per non-zero input word,
// each record being the 32-bit word and then its 16-bit byte offset within the page.
// The source page is re-read from its saved start address, 8 words at a time. The ldm below overwrites
// r2, r3, r5, r6, r7, r8, r9 and r10, so none of the scan-phase names held in those registers are valid any more.
// R11 (the sparse size computed before entering this block) is the return value.
596
597 mov edx, #MZV_MAGIC
598 str edx, [dest_buf], #4 // header to indicate a sparse packer
599
600 ldr R13, start_next_input_word // get the starting address of src
601 mov edx, #0 // edx = byte offset of the current word within the page
602 mov ecx, #4096 // ecx = bytes of input left to process
603
6041:
605 ldm R13!, {r2, r3, r5, r6, r7, r8, r9, r10} // load the next 8 input words
606
// the teqeq chain leaves Z set only if all 8 words are zero
607 teq r2, #0
608 teqeq r3, #0
609 teqeq r5, #0
610 teqeq r6, #0
611 teqeq r7, #0
612 teqeq r8, #0
613 teqeq r9, #0
614 teqeq r10, #0
615
616 bne 2f // at least one non-zero word in this group
617 subs ecx, ecx, #32
618 add edx, edx, #32 // 32 more bytes have been processed (add does not alter the flags set by subs)
619 bne 1b
620 mov r0, R11 // store the size of the compressed stream
621 b L_done
622
6232:
// emit a record for every non-zero word of the group; edx advances by 4 for each word, zero or not
624 teq r2, #0
625 strne r2, [dest_buf], #4 // store the non-0 word in the dest buffer
626 strhne edx, [dest_buf], #2 // store the byte index
627 add edx, edx, 4
628
629 teq r3, #0
630 strne r3, [dest_buf], #4 // store the non-0 word in the dest buffer
631 strhne edx, [dest_buf], #2 // store the byte index
632 add edx, edx, 4
633
634 teq r5, #0
635 strne r5, [dest_buf], #4 // store the non-0 word in the dest buffer
636 strhne edx, [dest_buf], #2 // store the byte index
637 add edx, edx, 4
638
639 teq r6, #0
640 strne r6, [dest_buf], #4 // store the non-0 word in the dest buffer
641 strhne edx, [dest_buf], #2 // store the byte index
642 add edx, edx, 4
643
644 teq r7, #0
645 strne r7, [dest_buf], #4 // store the non-0 word in the dest buffer
646 strhne edx, [dest_buf], #2 // store the byte index
647 add edx, edx, 4
648
649 teq r8, #0
650 strne r8, [dest_buf], #4 // store the non-0 word in the dest buffer
651 strhne edx, [dest_buf], #2 // store the byte index
652 add edx, edx, 4
653
654 teq r9, #0
655 strne r9, [dest_buf], #4 // store the non-0 word in the dest buffer
656 strhne edx, [dest_buf], #2 // store the byte index
657 add edx, edx, 4
658
659 teq r10, #0
660 strne r10, [dest_buf], #4 // store the non-0 word in the dest buffer
661 strhne edx, [dest_buf], #2 // store the byte index
662 add edx, edx, 4
663
664 subs ecx, ecx, #32 // 32 more input bytes consumed
665 bne 1b
666 mov r0, R11 // store the size of the compressed stream
667 b L_done
668
669L_check_compression_ratio:
// Early-abort check, run once after the first CHKPT_WORDS input words. It switches mode to NORMAL, reloads
// `remaining` with the rest of the page, and gives up with -1 if the estimated compressed size of the data
// seen so far exceeds CHKPT_SHRUNK_BYTES.
670
671 mov eax, #NORMAL
672 str eax, mode
673 mov remaining, #(1024 - CHKPT_WORDS) // remaining input words to process
674 cmp remaining, #0
675 beq CHECKPOINT // if there are no remaining words to process (mode is NORMAL now, so CHECKPOINT goes on to the page checks)
676
677 ldr eax, start_next_low_bits
678 sub eax, next_low_bits, eax // get bytes consumed by lower-10 bits
679 mov edx, #1365
680 mul eax, eax, edx // 1365/2048 is approximately 2/3; the shift by 11 happens in the add below
681
682 ldr edx, start_next_full_patt
683 sub edx, next_full_patt, edx // get bytes consumed by dictionary misses
684 add eax, edx, eax, lsr #11 // eax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
685
686 add edx, scratch, #QPosArray_offset // get start_next_qp
687 sub edx, next_qp, edx
688 add eax, eax, edx, lsr #1 // eax += (next_qp - start_next_qp)/2
689 mov edx, #(CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)
690 subs eax, eax, edx // eax += CHKPT_TAG_BYTES; eax -= CHKPT_SHRUNK_BYTES
691 bgt L_budgetExhausted // if eax is > 0, we need to early abort
692 b L_scan_loop // we are done: resume scanning the rest of the page
693
694
// Literal used by the prologue to locate _hashLookupTable_new.
695#if defined(KERNEL) && !SLIDABLE
// Non-slidable kernel build: the literal is the address of the table itself.
696 .align 2
697L_table:
698 .long _hashLookupTable_new
699#else
// Otherwise the literal is the distance from (L_table0+8) to the non-lazy symbol pointer defined below.
// The prologue adds it to pc at L_table0 and loads the table address through that pointer.
700 .align 2
701L_table:
702 .long L_Tab$non_lazy_ptr-(L_table0+8)
703
704 .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
705 .align 2
706L_Tab$non_lazy_ptr:
707 .indirect_symbol _hashLookupTable_new
708 .long 0
709#endif
710